From 1d59cbb06d0e1446c1d81c3ea3fc38ff13d4cdcb Mon Sep 17 00:00:00 2001 From: vedithal-amd Date: Thu, 26 Jun 2025 09:03:18 -0400 Subject: [PATCH] Add support for MI 100 with rocprofiler-sdk (#768) * Add custom rocprofiler-sdk counter definitions file for MI 100 * Update CHANGELOG to mention that accumulation counters will not be collected when profiling on MI 100 using rocprofiler-sdk/rocprofv3 * Migrate accum_counters.yaml to code [ROCm/rocprofiler-compute commit: a95a45d69a01491dcee65c68a17b190403889fdb] --- projects/rocprofiler-compute/CHANGELOG.md | 5 +- .../rocprof_compute_profile/profiler_base.py | 1 - .../profile_configs/accum_counters.yaml | 58 - .../profile_configs/gfx908_counter_defs.yaml | 2841 +++++++++++++++++ .../src/rocprof_compute_soc/soc_base.py | 94 +- .../src/rocprof_compute_soc/soc_gfx908.py | 2 +- .../rocprofiler-compute/src/utils/utils.py | 173 +- 7 files changed, 2974 insertions(+), 200 deletions(-) delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/accum_counters.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index ed65346f98..1682d155ff 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -81,6 +81,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. ### Resolved issues +* Fixed MI 100 counters not being collected when rocprofv3 is used * Fixed option specs-correction * Fixed kernel name and kernel dispatch filtering when using rocprof v3 * Fixed not collecting TCC channel counters in rocprof v3 @@ -88,7 +89,9 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. 
### Known issues -* Profiling on MI 100 will not work unless ROCPROF=rocprofv1 environment variable is explictly provided +* On MI 100, accumulation counters will not be collected and the following metrics will not show up in analysis: Instruction Fetch Latency, Wavefront Occupancy, LDS Latency + * As a workaround, set the ROCPROF=rocprof environment variable to use rocprofv1 for profiling on MI 100 + * GPU id filtering is not supported when using rocprof v3 * Analysis of previously collected workload data will not work due to sysinfo.csv schema change diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py index d275379b95..bbc83fd7b6 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py @@ -33,7 +33,6 @@ from pathlib import Path import pandas as pd -import config from utils.logger import ( console_debug, console_error, diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/accum_counters.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/accum_counters.yaml deleted file mode 100644 index 649188eb48..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/accum_counters.yaml +++ /dev/null @@ -1,58 +0,0 @@ -rocprofiler-sdk: - counters-schema-version: 1 - counters: - - name: SQ_IFETCH_LEVEL_ACCUM - description: 'SQ_IFETCH_LEVEL accumulation' - properties: [] - definitions: - - architectures: - - gfx942 - - gfx941 - - gfx940 - - gfx90a - - gfx950 - expression: accumulate(SQ_IFETCH_LEVEL, HIGH_RES) - - name: SQ_INST_LEVEL_LDS_ACCUM - description: 'SQ_INST_LEVEL_LDS accumulation' - properties: [] - definitions: - - architectures: - - gfx942 - - gfx941 - - gfx940 - - gfx90a - - gfx950 - expression: accumulate(SQ_INST_LEVEL_LDS, HIGH_RES) - - name: SQ_INST_LEVEL_SMEM_ACCUM - 
description: 'SQ_INST_LEVEL_SMEM accumulation' - properties: [] - definitions: - - architectures: - - gfx942 - - gfx941 - - gfx940 - - gfx90a - - gfx950 - expression: accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES) - - name: SQ_INST_LEVEL_VMEM_ACCUM - description: 'SQ_INST_LEVEL_VMEM accumulation' - properties: [] - definitions: - - architectures: - - gfx942 - - gfx941 - - gfx940 - - gfx90a - - gfx950 - expression: accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES) - - name: SQ_LEVEL_WAVES_ACCUM - description: 'SQ_LEVEL_WAVES accumulation' - properties: [] - definitions: - - architectures: - - gfx942 - - gfx941 - - gfx940 - - gfx90a - - gfx950 - expression: accumulate(SQ_LEVEL_WAVES, HIGH_RES) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml new file mode 100644 index 0000000000..644a8843be --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml @@ -0,0 +1,2841 @@ +rocprofiler-sdk: + counters-schema-version: 1 + counters: + - name: CPC_ME1_BUSY_FOR_PACKET_DECODE + description: Me1 busy for packet decode. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 13 + - name: CPC_UTCL1_STALL_ON_TRANSLATION + description: One of the UTCL1s is stalled waiting on translation, XNACK or PENDING + response. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 24 + - name: CPC_CPC_STAT_BUSY + description: CPC Busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 25 + - name: CPC_CPC_STAT_IDLE + description: CPC Idle. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 26 + - name: CPC_CPC_STAT_STALL + description: CPC Stalled. 
+ properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 27 + - name: CPC_CPC_TCIU_BUSY + description: CPC TCIU interface Busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 28 + - name: CPC_CPC_TCIU_IDLE + description: CPC TCIU interface Idle. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 29 + - name: CPC_CPC_UTCL2IU_BUSY + description: CPC UTCL2 interface Busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 30 + - name: CPC_CPC_UTCL2IU_IDLE + description: CPC UTCL2 interface Idle. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 31 + - name: CPC_CPC_UTCL2IU_STALL + description: CPC UTCL2 interface Stalled waiting on Free, Tags or Translation. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 32 + - name: CPC_ME1_DC0_SPI_BUSY + description: CPC Me1 Processor Busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPC + event: 33 + - name: CPF_CMP_UTCL1_STALL_ON_TRANSLATION + description: One of the Compute UTCL1s is stalled waiting on translation, XNACK + or PENDING response. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPF + event: 20 + - name: CPF_CPF_STAT_BUSY + description: CPF Busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPF + event: 23 + - name: CPF_CPF_STAT_IDLE + description: CPF Idle. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPF + event: 24 + - name: CPF_CPF_STAT_STALL + description: CPF Stalled. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPF + event: 25 + - name: CPF_CPF_TCIU_BUSY + description: CPF TCIU interface Busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPF + event: 26 + - name: CPF_CPF_TCIU_IDLE + description: CPF TCIU interface Idle. 
+ properties: [] + definitions: + - architectures: + - gfx908 + block: CPF + event: 27 + - name: CPF_CPF_TCIU_STALL + description: CPF TCIU interface Stalled waiting on Free, Tags. + properties: [] + definitions: + - architectures: + - gfx908 + block: CPF + event: 28 + - name: GRBM_COUNT + description: Tie High - Count Number of Clocks + properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 0 + - name: GRBM_GUI_ACTIVE + description: The GUI is Active + properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 2 + - name: GRBM_CP_BUSY + description: Any of the Command Processor (CPG/CPC/CPF) blocks are busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 3 + - name: GRBM_SPI_BUSY + description: Any of the Shader Pipe Interpolators (SPI) are busy in the shader + engine(s). + properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 11 + - name: GRBM_TA_BUSY + description: Any of the Texture Pipes (TA) are busy in the shader engine(s). + properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 13 + - name: GRBM_TC_BUSY + description: Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 28 + - name: GRBM_CPC_BUSY + description: The Command Processor Compute (CPC) is busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 30 + - name: GRBM_CPF_BUSY + description: The Command Processor Fetchers (CPF) is busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 31 + - name: GRBM_UTCL2_BUSY + description: The Unified Translation Cache Level-2 (UTCL2) block is busy. + properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 34 + - name: GRBM_EA_BUSY + description: The Efficiency Arbiter (EA) block is busy. 
+ properties: [] + definitions: + - architectures: + - gfx908 + block: GRBM + event: 35 + - name: SPI_CSN_WINDOW_VALID + description: Clock count enabled by perfcounter_start event. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL + to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source + is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 47 + - name: SPI_CSN_BUSY + description: Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL + to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source + is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 48 + - name: SPI_CSN_NUM_THREADGROUPS + description: Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL + to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source + is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 49 + - name: SPI_CSN_WAVE + description: Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select + source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; + DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 52 + - name: SPI_RA_REQ_NO_ALLOC + description: Arb cycles with requests but no allocation. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 79 + - name: SPI_RA_REQ_NO_ALLOC_CSN + description: Arb cycles with CSn req and no CSn alloc. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 85 + - name: SPI_RA_RES_STALL_CSN + description: Arb cycles with CSn req and no CSn fits. 
Source is RA0 + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 91 + - name: SPI_RA_TMP_STALL_CSN + description: Cycles where csn wants to req but does not fit in temp space. + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 97 + - name: SPI_RA_WAVE_SIMD_FULL_CSN + description: Sum of SIMD where WAVE can't take csn wave when !fits. Source is + RA0 + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 103 + - name: SPI_RA_VGPR_SIMD_FULL_CSN + description: Sum of SIMD where VGPR can't take csn wave when !fits. Source is + RA0 + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 109 + - name: SPI_RA_SGPR_SIMD_FULL_CSN + description: Sum of SIMD where SGPR can't take csn wave when !fits. Source is + RA0 + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 115 + - name: SPI_RA_LDS_CU_FULL_CSN + description: Sum of CU where LDS can't take csn wave when !fits. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 120 + - name: SPI_RA_BAR_CU_FULL_CSN + description: Sum of CU where BARRIER can't take csn wave when !fits. Source is + RA0 + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 123 + - name: SPI_RA_BULKY_CU_FULL_CSN + description: Sum of CU where BULKY can't take csn wave when !fits. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 125 + - name: SPI_RA_TGLIM_CU_FULL_CSN + description: Cycles where csn wants to req but all CU are at tg_limit + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 127 + - name: SPI_RA_WVLIM_STALL_CSN + description: Number of clocks csn is stalled due to WAVE LIMIT. 
+ properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 133 + - name: SPI_SWC_CSC_WR + description: Number of clocks to write CSC waves to SGPRs (need to multiply this + value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL + = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source + is CS3; default, source is CS0; + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 189 + - name: SPI_VWC_CSC_WR + description: Number of clocks to write CSC waves to VGPRs (need to multiply this + value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL + = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source + is CS3; default, source is CS0; + properties: [] + definitions: + - architectures: + - gfx908 + block: SPI + event: 195 + - name: SQ_ACCUM_PREV + description: For counter N, increment by the value of counter N-1. Only accumulates + once every 4 cycles. + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 1 + - name: SQ_CYCLES + description: Clock cycles. (nondeterministic, per-simd, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 2 + - name: SQ_BUSY_CYCLES + description: Clock cycles while SQ is reporting that it is busy. (nondeterministic, + per-simd, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 3 + - name: SQ_WAVES + description: Count number of waves sent to SQs. (per-simd, emulated, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 4 + - name: SQ_LEVEL_WAVES + description: Track the number of waves. Set ACCUM_PREV for the next counter to + use this. 
(level, per-simd, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 5 + - name: SQ_WAVES_EQ_64 + description: Count number of waves with exactly 64 active threads sent to SQs. + (per-simd, emulated, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 6 + - name: SQ_WAVES_LT_64 + description: Count number of waves with <64 active threads sent to SQs. (per-simd, + emulated, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 7 + - name: SQ_WAVES_LT_48 + description: Count number of waves with <48 active threads sent to SQs. (per-simd, + emulated, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 8 + - name: SQ_WAVES_LT_32 + description: Count number of waves sent <32 active threads sent to SQs. (per-simd, + emulated, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 9 + - name: SQ_WAVES_LT_16 + description: Count number of waves sent <16 active threads sent to SQs. (per-simd, + emulated, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 10 + - name: SQ_BUSY_CU_CYCLES + description: Count quad-cycles each CU is busy. (nondeterministic, per-simd) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 13 + - name: SQ_ITEMS + description: Number of valid items per wave. (per-simd, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 14 + - name: SQ_INSTS + description: Number of instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 25 + - name: SQ_INSTS_VALU + description: Number of VALU instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 26 + - name: SQ_INSTS_MFMA + description: Number of MFMA instructions issued. 
(per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 27 + - name: SQ_INSTS_VMEM_WR + description: Number of VMEM write instructions issued (including FLAT). (per-simd, + emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 28 + - name: SQ_INSTS_VMEM_RD + description: Number of VMEM read instructions issued (including FLAT). (per-simd, + emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 29 + - name: SQ_INSTS_VMEM + description: Number of VMEM instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 30 + - name: SQ_INSTS_SALU + description: Number of SALU instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 31 + - name: SQ_INSTS_SMEM + description: Number of SMEM instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 32 + - name: SQ_INSTS_FLAT + description: Number of FLAT instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 33 + - name: SQ_INSTS_FLAT_LDS_ONLY + description: Number of FLAT instructions issued that read/wrote only from/to LDS + (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 34 + - name: SQ_INSTS_LDS + description: Number of LDS instructions issued (including FLAT). (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 35 + - name: SQ_INSTS_GDS + description: Number of GDS instructions issued. 
(per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 36 + - name: SQ_INSTS_EXP_GDS + description: Number of EXP and GDS instructions issued, excluding skipped export + instructions. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 38 + - name: SQ_INSTS_BRANCH + description: Number of Branch instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 39 + - name: SQ_INSTS_SENDMSG + description: Number of Sendmsg instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 40 + - name: SQ_INSTS_VSKIPPED + description: Number of vector instructions skipped. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 41 + - name: SQ_INST_LEVEL_VMEM + description: Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV + and divide by INSTS_VMEM for average latency. Includes FLAT instructions. (per-simd, + level, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 42 + - name: SQ_INST_LEVEL_SMEM + description: Number of in-flight SMEM instructions (*2 load/store; *2 atomic; + *2 memtime; *4 wb/inv). Set next counter to ACCUM_PREV and divide by INSTS_SMEM + for average latency per smem request. Falls slightly short of total request + latency because some fetches are divided into two requests that may finish at + different times and this counter collects the average latency of the two. (per-simd, + level, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 43 + - name: SQ_INST_LEVEL_LDS + description: Number of in-flight LDS instructions. Set next counter to ACCUM_PREV + and divide by INSTS_LDS for average latency. Includes FLAT instructions. 
(per-simd, + level, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 44 + - name: SQ_WAVE_CYCLES + description: Number of wave-cycles spent by waves in the CUs (per-simd, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 47 + - name: SQ_WAIT_ANY + description: Number of wave-cycles spent waiting for anything (per-simd, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 58 + - name: SQ_WAIT_INST_ANY + description: Number of wave-cycles spent waiting for any instruction issue. In + units of 4 cycles. (per-simd, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 61 + - name: SQ_WAIT_INST_LDS + description: Number of wave-cycles spent waiting for LDS instruction issue. In + units of 4 cycles. (per-simd, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 64 + - name: SQ_ACTIVE_INST_ANY + description: Number of cycles each wave is working on an instruction. (per-simd, + emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 69 + - name: SQ_ACTIVE_INST_VMEM + description: Number of cycles the SQ instruction arbiter is working on a VMEM + instruction. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 70 + - name: SQ_ACTIVE_INST_LDS + description: Number of cycles the SQ instruction arbiter is working on a LDS instruction. + (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 71 + - name: SQ_ACTIVE_INST_VALU + description: Number of cycles the SQ instruction arbiter is working on a VALU + instruction. 
(per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 72 + - name: SQ_ACTIVE_INST_SCA + description: Number of cycles the SQ instruction arbiter is working on a SALU + or SMEM instruction. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 73 + - name: SQ_ACTIVE_INST_EXP_GDS + description: Number of cycles the SQ instruction arbiter is working on an EXPORT + or GDS instruction. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 74 + - name: SQ_ACTIVE_INST_MISC + description: Number of cycles the SQ instruction aribter is working on a BRANCH + or SENDMSG instruction. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 75 + - name: SQ_ACTIVE_INST_FLAT + description: Number of cycles the SQ instruction arbiter is working on a FLAT + instruction. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 76 + - name: SQ_INST_CYCLES_VMEM_WR + description: Number of cycles needed to send addr and cmd data for VMEM write + instructions. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 77 + - name: SQ_INST_CYCLES_VMEM_RD + description: Number of cycles needed to send addr and cmd data for VMEM read instructions. + (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 78 + - name: SQ_INST_CYCLES_SMEM + description: Number of cycles needed to execute scalar memory reads. (per-simd, + emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 84 + - name: SQ_INST_CYCLES_SALU + description: Number of cycles needed to execute non-memory read scalar operations. 
+ (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 85 + - name: SQ_THREAD_CYCLES_VALU + description: 'Number of thread-cycles used to execute VALU operations (similar + to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)' + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 86 + - name: SQ_IFETCH + description: Number of instruction fetch requests from cache. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 88 + - name: SQ_IFETCH_LEVEL + description: Number of instruction fetch requests from cache. (per-simd, level) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 89 + - name: SQ_LDS_BANK_CONFLICT + description: Number of cycles LDS is stalled by bank conflicts. (emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 94 + - name: SQ_LDS_ADDR_CONFLICT + description: Number of cycles LDS is stalled by address conflicts. (emulated,nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 95 + - name: SQ_LDS_UNALIGNED_STALL + description: Number of cycles LDS is stalled processing flat unaligned load/store + ops. (emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 96 + - name: SQ_LDS_MEM_VIOLATIONS + description: Number of threads that have a memory violation in the LDS.(emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 97 + - name: SQ_LDS_ATOMIC_RETURN + description: Number of atomic return cycles in LDS. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 98 + - name: SQ_LDS_IDX_ACTIVE + description: Number of cycles LDS is used for indexed (non-direct,non-interpolation) + operations. 
(per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 99 + - name: SQ_ACCUM_PREV_HIRES + description: For counter N, increment by the value of counter N-1. + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 158 + - name: SQ_WAVES_RESTORED + description: Count number of context-restored waves sent to SQs. (per-simd, emulated, + global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 159 + - name: SQ_WAVES_SAVED + description: Count number of context-saved waves. (per-simd, emulated, global) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 160 + - name: SQ_INSTS_SMEM_NORM + description: Number of SMEM instructions issued normalized to match smem_level + (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 161 + - name: SQC_DCACHE_INPUT_VALID_READYB + description: Input stalled by SQC (per-SQ, nondeterministic, unwindowed) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 260 + - name: SQC_TC_REQ + description: Total number of TC requests that were issued by instruction and constant + caches. 
(No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 262 + - name: SQC_TC_INST_REQ + description: Number of insruction requests to the TC (No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 263 + - name: SQC_TC_DATA_READ_REQ + description: Number of data read requests to the TC (No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 264 + - name: SQC_TC_DATA_WRITE_REQ + description: Number of data write requests to the TC (No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 265 + - name: SQC_TC_DATA_ATOMIC_REQ + description: Number of data atomic requests to the TC (No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 266 + - name: SQC_TC_STALL + description: Valid request stalled TC request interface (no-credits). (No-Masking, + nondeterministic, unwindowed) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 267 + - name: SQC_ICACHE_REQ + description: Number of requests. (per-SQ, per-Bank) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 270 + - name: SQC_ICACHE_HITS + description: Number of cache hits. (per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 271 + - name: SQC_ICACHE_MISSES + description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, + nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 272 + - name: SQC_ICACHE_MISSES_DUPLICATE + description: Number of misses that were duplicates (access to a non-resident, + miss pending CL). 
(per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 273 + - name: SQC_DCACHE_REQ + description: Number of requests (post-bank-serialization). (per-SQ, per-Bank) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 290 + - name: SQC_DCACHE_HITS + description: Number of cache hits. (per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 291 + - name: SQC_DCACHE_MISSES + description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, + nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 292 + - name: SQC_DCACHE_MISSES_DUPLICATE + description: Number of misses that were duplicates (access to a non-resident, + miss pending CL). (per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 293 + - name: SQC_DCACHE_ATOMIC + description: Number of atomic requests. (per-SQ, per-Bank) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 298 + - name: SQC_DCACHE_REQ_READ_1 + description: Number of constant cache 1 dw read requests. (per-SQ) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 323 + - name: SQC_DCACHE_REQ_READ_2 + description: Number of constant cache 2 dw read requests. (per-SQ) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 324 + - name: SQC_DCACHE_REQ_READ_4 + description: Number of constant cache 4 dw read requests. (per-SQ) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 325 + - name: SQC_DCACHE_REQ_READ_8 + description: Number of constant cache 8 dw read requests. 
(per-SQ) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 326 + - name: SQC_DCACHE_REQ_READ_16 + description: Number of constant cache 16 dw read requests. (per-SQ) + properties: [] + definitions: + - architectures: + - gfx908 + block: SQ + event: 327 + - name: TA_TA_BUSY + description: TA block is busy. Perf_Windowing not supported for this counter. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 15 + - name: TA_TOTAL_WAVEFRONTS + description: Total number of wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 32 + - name: TA_BUFFER_WAVEFRONTS + description: Number of buffer wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 44 + - name: TA_BUFFER_READ_WAVEFRONTS + description: Number of buffer read wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 45 + - name: TA_BUFFER_WRITE_WAVEFRONTS + description: Number of buffer write wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 46 + - name: TA_BUFFER_ATOMIC_WAVEFRONTS + description: Number of buffer atomic wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 47 + - name: TA_BUFFER_TOTAL_CYCLES + description: Number of buffer cycles issued to TC. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 49 + - name: TA_BUFFER_COALESCED_READ_CYCLES + description: Number of buffer coalesced read cycles issued to TC. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 52 + - name: TA_BUFFER_COALESCED_WRITE_CYCLES + description: Number of buffer coalesced write cycles issued to TC. 
+ properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 53 + - name: TA_ADDR_STALLED_BY_TC_CYCLES + description: Number of cycles addr path stalled by TC. Perf_Windowing not supported + for this counter. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 54 + - name: TA_ADDR_STALLED_BY_TD_CYCLES + description: Number of cycles addr path stalled by TD. Perf_Windowing not supported + for this counter. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 55 + - name: TA_DATA_STALLED_BY_TC_CYCLES + description: Number of cycles data path stalled by TC. Perf_Windowing not supported + for this counter. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 56 + - name: TA_FLAT_WAVEFRONTS + description: Number of flat opcode wavefronts processed by the TA. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 100 + - name: TA_FLAT_READ_WAVEFRONTS + description: Number of flat opcode reads processed by the TA. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 101 + - name: TA_FLAT_WRITE_WAVEFRONTS + description: Number of flat opcode writes processed by the TA. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 102 + - name: TA_FLAT_ATOMIC_WAVEFRONTS + description: Number of flat opcode atomics processed by the TA. + properties: [] + definitions: + - architectures: + - gfx908 + block: TA + event: 103 + - name: TCA_CYCLE + description: Number of cycles. Not windowable. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCA + event: 1 + - name: TCA_BUSY + description: Number of cycles we have a request pending. Not windowable. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCA + event: 2 + - name: TCC_CYCLE + description: Number of cycles. Not windowable.
+ properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 1 + - name: TCC_BUSY + description: Number of cycles we have a request pending. Not windowable. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 2 + - name: TCC_REQ + description: Number of requests of all types. This is measured at the tag block. + This may be more than the number of requests arriving at the TCC, but it is + a good indication of the total amount of work that needs to be performed. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 3 + - name: TCC_STREAMING_REQ + description: Number of streaming requests. This is measured at the tag block. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 4 + - name: TCC_NC_REQ + description: The number of noncoherently cached requests. This is measured at + the tag block. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 5 + - name: TCC_UC_REQ + description: The number of uncached requests. This is measured at the tag block. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 6 + - name: TCC_CC_REQ + description: The number of coherently cached requests. This is measured at the + tag block. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 7 + - name: TCC_RW_REQ + description: The number of RW requests. This is measured at the tag block. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 8 + - name: TCC_PROBE + description: Number of probe requests. Not windowable. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 9 + - name: TCC_PROBE_ALL + description: Number of external probe requests with EA_TCC_preq_all== 1. + Not windowable.
+ properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 10 + - name: TCC_READ + description: Number of read requests. Compressed reads are included in this, but + metadata reads are not included. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 12 + - name: TCC_WRITE + description: Number of write requests. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 13 + - name: TCC_ATOMIC + description: Number of atomic requests of all types. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 14 + - name: TCC_HIT + description: Number of cache hits. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 17 + - name: TCC_MISS + description: Number of cache misses. UC reads count as misses. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 19 + - name: TCC_WRITEBACK + description: Number of lines written back to main memory. This includes writebacks + of dirty lines and uncached write/atomic requests. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 22 + - name: TCC_EA_WRREQ + description: Number of transactions (either 32-byte or 64-byte) going over the + TC_EA_wrreq interface. Atomics may travel over the same interface and are generally + classified as write requests. This does not include probe commands. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 26 + - name: TCC_EA_WRREQ_64B + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over + the TC_EA_wrreq interface. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 27 + - name: TCC_EA_WR_UNCACHED_32B + description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface + due to uncached traffic. Note that CC mtypes can produce uncached requests, + and those are included in this. 
A 64-byte request will be counted as 2 + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 29 + - name: TCC_EA_WRREQ_STALL + description: Number of cycles a write request was stalled. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 30 + - name: TCC_EA_WRREQ_IO_CREDIT_STALL + description: Number of cycles a EA write request was stalled because the interface + was out of IO credits. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 31 + - name: TCC_EA_WRREQ_GMI_CREDIT_STALL + description: Number of cycles a EA write request was stalled because the interface + was out of GMI credits. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 32 + - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL + description: Number of cycles a EA write request was stalled because the interface + was out of DRAM credits. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 33 + - name: TCC_TOO_MANY_EA_WRREQS_STALL + description: Number of cycles the TCC could not send a EA write request because + it already reached its maximum number of pending EA write requests. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 34 + - name: TCC_EA_WRREQ_LEVEL + description: The sum of the number of EA write requests in flight. This is primarily + meant for measure average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 35 + - name: TCC_EA_ATOMIC + description: Number of transactions going over the TC_EA_wrreq interface that + are actually atomic requests. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 36 + - name: TCC_EA_ATOMIC_LEVEL + description: The sum of the number of EA atomics in flight. This is primarily + meant for measure average EA atomic latency. 
Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 37 + - name: TCC_EA_RDREQ + description: Number of TCC/EA read requests (either 32-byte or 64-byte) + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 38 + - name: TCC_EA_RDREQ_32B + description: Number of 32-byte TCC/EA read requests + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 39 + - name: TCC_EA_RD_UNCACHED_32B + description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte + request will be counted as 2 + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 40 + - name: TCC_EA_RDREQ_IO_CREDIT_STALL + description: Number of cycles there was a stall because the read request interface + was out of IO credits. Stalls occur regardless of whether a read needed to be + performed or not. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 41 + - name: TCC_EA_RDREQ_GMI_CREDIT_STALL + description: Number of cycles there was a stall because the read request interface + was out of GMI credits. Stalls occur regardless of whether a read needed to + be performed or not. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 42 + - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL + description: Number of cycles there was a stall because the read request interface + was out of DRAM credits. Stalls occur regardless of whether a read needed to + be performed or not. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 43 + - name: TCC_EA_RDREQ_LEVEL + description: The sum of the number of TCC/EA read requests in flight. This is + primarily meant for measure average EA read latency. Average read latency = + TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. 
+ properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 44 + - name: TCC_TAG_STALL + description: Number of cycles the normal request pipeline in the tag was stalled + for any reason. Normally, stalls of this nature are measured exactly from one + point in the pipeline, but that is not the case for this counter. Probes can stall + the pipeline at a variety of places, and there is no single point that can reasonably + measure the total stalls accurately. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 45 + - name: TCC_NORMAL_WRITEBACK + description: Number of writebacks due to requests that are not writeback requests. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 68 + - name: TCC_ALL_TC_OP_WB_WRITEBACK + description: Number of writebacks due to all TC_OP writeback requests. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 73 + - name: TCC_NORMAL_EVICT + description: Number of evictions due to requests that are not invalidate or probe + requests. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 74 + - name: TCC_ALL_TC_OP_INV_EVICT + description: Number of evictions due to all TC_OP invalidate requests. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 80 + - name: TCC_EA_RDREQ_DRAM + description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined + for DRAM (MC). + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 102 + - name: TCC_EA_WRREQ_DRAM + description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined + for DRAM (MC).
+ properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 103 + - name: TCC_CLIENT184_REQ + description: '' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 312 + - name: TCC_CLIENT185_REQ + description: '' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 313 + - name: TCC_CLIENT186_REQ + description: '' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 314 + - name: TCC_CLIENT187_REQ + description: '' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 315 + - name: TCC_CLIENT188_REQ + description: '' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 316 + - name: TCC_CLIENT189_REQ + description: '' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 317 + - name: TCC_CLIENT190_REQ + description: '' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 318 + - name: TCC_CLIENT191_REQ + description: '' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 319 + - name: TCP_GATE_EN1 + description: TCP interface clocks are turned on. Not Windowed. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 0 + - name: TCP_GATE_EN2 + description: TCP core clocks are turned on. Not Windowed. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 1 + - name: TCP_TCP_TA_DATA_STALL_CYCLES + description: TCP stalls TA data interface. Not Windowed. 
+ properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 6 + - name: TCP_TD_TCP_STALL_CYCLES + description: TD stalls TCP + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 7 + - name: TCP_TCR_TCP_STALL_CYCLES + description: TCR stalls TCP_TCR_req interface + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 8 + - name: TCP_READ_TAGCONFLICT_STALL_CYCLES + description: Tagram conflict stall on a read + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 11 + - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES + description: Tagram conflict stall on a write + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 12 + - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES + description: Tagram conflict stall on an atomic + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 13 + - name: TCP_PENDING_STALL_CYCLES + description: Stall due to data pending from L2 + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 22 + - name: TCP_TA_TCP_STATE_READ + description: Number of state reads + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 27 + - name: TCP_VOLATILE + description: Total number of L1 volatile pixels/buffers from TA + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 28 + - name: TCP_TOTAL_ACCESSES + description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 29 + - name: TCP_TOTAL_READ + description: Total number of read pixels/buffers from TA. 
Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 30 + - name: TCP_TOTAL_WRITE + description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ + TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 32 + - name: TCP_TOTAL_ATOMIC_WITH_RET + description: Total number of atomic with return pixels/buffers from TA + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 38 + - name: TCP_TOTAL_ATOMIC_WITHOUT_RET + description: Total number of atomic without return pixels/buffers from TA + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 39 + - name: TCP_TOTAL_WRITEBACK_INVALIDATES + description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ + TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. + Not Windowed. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 45 + - name: TCP_UTCL1_REQUEST + description: Total CLIENT_UTCL1 NORMAL requests + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 47 + - name: TCP_UTCL1_TRANSLATION_MISS + description: Total utcl1 translation misses + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 48 + - name: TCP_UTCL1_TRANSLATION_HIT + description: Total utcl1 translation hits + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 49 + - name: TCP_UTCL1_PERMISSION_MISS + description: Total utcl1 permission misses + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 50 + - name: TCP_TOTAL_CACHE_ACCESSES + description: Count of total cache line (tag) accesses (includes hits and misses). 
+ properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 60 + - name: TCP_TCP_LATENCY + description: Total TCP wave latency (from first clock of wave entering to first + clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 65 + - name: TCP_TCC_READ_REQ_LATENCY + description: Total TCP->TCC request latency for reads and atomics with return. + Not Windowed. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 66 + - name: TCP_TCC_WRITE_REQ_LATENCY + description: Total TCP->TCC request latency for writes and atomics without return. + Not Windowed. + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 67 + - name: TCP_TCC_READ_REQ + description: Total read requests from TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 69 + - name: TCP_TCC_WRITE_REQ + description: Total write requests from TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 70 + - name: TCP_TCC_ATOMIC_WITH_RET_REQ + description: Total atomic with return requests from TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 71 + - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ + description: Total atomic without return requests from TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 72 + - name: TCP_TCC_NC_READ_REQ + description: Total read requests with NC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 75 + - name: TCP_TCC_NC_WRITE_REQ + description: Total write requests with NC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 76 + - name: TCP_TCC_NC_ATOMIC_REQ + description: Total atomic requests with NC 
mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 77 + - name: TCP_TCC_UC_READ_REQ + description: Total read requests with UC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 78 + - name: TCP_TCC_UC_WRITE_REQ + description: Total write requests with UC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 79 + - name: TCP_TCC_UC_ATOMIC_REQ + description: Total atomic requests with UC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 80 + - name: TCP_TCC_CC_READ_REQ + description: Total read requests with CC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 81 + - name: TCP_TCC_CC_WRITE_REQ + description: Total write requests with CC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 82 + - name: TCP_TCC_CC_ATOMIC_REQ + description: Total atomic requests with CC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 83 + - name: TCP_TCC_RW_READ_REQ + description: Total read requests with RW mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 85 + - name: TCP_TCC_RW_WRITE_REQ + description: Total write requests with RW mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 86 + - name: TCP_TCC_RW_ATOMIC_REQ + description: Total atomic requests with RW mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx908 + block: TCP + event: 87 + - name: TD_TD_BUSY + description: TD is processing or waiting for data. Perf_Windowing not supported + for this counter.
+ properties: [] + definitions: + - architectures: + - gfx908 + block: TD + event: 1 + - name: TD_TC_STALL + description: TD is stalled waiting for TC data. + properties: [] + definitions: + - architectures: + - gfx908 + block: TD + event: 15 + - name: TD_RESERVED_18 + description: RESERVED_18 + properties: [] + definitions: + - architectures: + - gfx908 + block: TD + event: 18 + - name: TD_LOAD_WAVEFRONT + description: Count the wavefronts with opcode = load, include atomics and store. + properties: [] + definitions: + - architectures: + - gfx908 + block: TD + event: 25 + - name: TD_ATOMIC_WAVEFRONT + description: Count the wavefronts with opcode = atomic. + properties: [] + definitions: + - architectures: + - gfx908 + block: TD + event: 26 + - name: TD_STORE_WAVEFRONT + description: Count the wavefronts with opcode = store. + properties: [] + definitions: + - architectures: + - gfx908 + block: TD + event: 27 + - name: TD_COALESCABLE_WAVEFRONT + description: Count wavefronts that TA finds coalescable. + properties: [] + definitions: + - architectures: + - gfx908 + block: TD + event: 32 + - name: TA_BUSY_avr + description: TA block is busy. Average over TA instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_TA_BUSY,avr) + - name: TA_BUSY_max + description: TA block is busy. Max over TA instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_TA_BUSY,max) + - name: TA_BUSY_min + description: TA block is busy. Min over TA instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_TA_BUSY,min) + - name: TA_FLAT_READ_WAVEFRONTS_sum + description: Number of flat opcode reads processed by the TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_FLAT_READ_WAVEFRONTS,sum) + - name: TA_FLAT_WRITE_WAVEFRONTS_sum + description: Number of flat opcode writes processed by the TA. 
Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_FLAT_WRITE_WAVEFRONTS,sum) + - name: TCC_BUSY_avr + description: TCC_BUSY avr over all memory channels. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_BUSY,avr) + - name: TCC_REQ_sum + description: TCC_REQ sum over all memory channels. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_REQ,sum) + - name: TCC_HIT_sum + description: Number of cache hits. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_HIT,sum) + - name: TCC_MISS_sum + description: Number of cache misses. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_MISS,sum) + - name: TCC_EA_RDREQ_32B_sum + description: Number of 32-byte TCC/EA read requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_RDREQ_32B,sum) + - name: TCC_EA_RDREQ_sum + description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over + TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_RDREQ,sum) + - name: TCC_EA_WRREQ_sum + description: Number of transactions (either 32-byte or 64-byte) going over the + TC_EA_wrreq interface. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WRREQ,sum) + - name: TCC_EA_WRREQ_64B_sum + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over + the TC_EA_wrreq interface. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WRREQ_64B,sum) + - name: TCC_WRREQ_STALL_max + description: Number of cycles a write request was stalled. Max over TCC instances. 
+ properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WRREQ_STALL,max) + - name: TCC_CYCLE_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_CYCLE,sum) + - name: TCC_BUSY_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_BUSY,sum) + - name: TCC_STREAMING_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_STREAMING_REQ,sum) + - name: TCC_NC_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_NC_REQ,sum) + - name: TCC_UC_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_UC_REQ,sum) + - name: TCC_CC_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_CC_REQ,sum) + - name: TCC_RW_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_RW_REQ,sum) + - name: TCC_PROBE_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_PROBE,sum) + - name: TCC_PROBE_ALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_PROBE_ALL,sum) + - name: TCC_READ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_READ,sum) + - name: TCC_WRITE_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_WRITE,sum) + - name: TCC_ATOMIC_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_ATOMIC,sum) + - name: TCC_TAG_STALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_TAG_STALL,sum) + - name: TCC_WRITEBACK_sum + description: . 
+ properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_WRITEBACK,sum) + - name: TCC_EA_WR_UNCACHED_32B_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WR_UNCACHED_32B,sum) + - name: TCC_EA_WRREQ_STALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WRREQ_STALL,sum) + - name: TCC_EA_WRREQ_IO_CREDIT_STALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WRREQ_IO_CREDIT_STALL,sum) + - name: TCC_EA_WRREQ_GMI_CREDIT_STALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WRREQ_GMI_CREDIT_STALL,sum) + - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WRREQ_DRAM_CREDIT_STALL,sum) + - name: TCC_TOO_MANY_EA_WRREQS_STALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_TOO_MANY_EA_WRREQS_STALL,sum) + - name: TCC_EA_WRREQ_LEVEL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WRREQ_LEVEL,sum) + - name: TCC_EA_RDREQ_LEVEL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_RDREQ_LEVEL,sum) + - name: TCC_EA_ATOMIC_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_ATOMIC,sum) + - name: TCC_EA_ATOMIC_LEVEL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_ATOMIC_LEVEL,sum) + - name: TCC_EA_RD_UNCACHED_32B_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_RD_UNCACHED_32B,sum) + - name: TCC_EA_RDREQ_IO_CREDIT_STALL_sum + description: . 
+ properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_RDREQ_IO_CREDIT_STALL,sum) + - name: TCC_EA_RDREQ_GMI_CREDIT_STALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_RDREQ_GMI_CREDIT_STALL,sum) + - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_RDREQ_DRAM_CREDIT_STALL,sum) + - name: TCC_NORMAL_WRITEBACK_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_NORMAL_WRITEBACK,sum) + - name: TCC_ALL_TC_OP_WB_WRITEBACK_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum) + - name: TCC_NORMAL_EVICT_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_NORMAL_EVICT,sum) + - name: TCC_ALL_TC_OP_INV_EVICT_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_ALL_TC_OP_INV_EVICT,sum) + - name: TCC_EA_RDREQ_DRAM_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_RDREQ_DRAM,sum) + - name: TCC_EA_WRREQ_DRAM_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCC_EA_WRREQ_DRAM,sum) + - name: FETCH_SIZE + description: The total kilobytes fetched from the video memory. This is measured + with all extra fetches and any cache or memory effects taken into account. + properties: [] + definitions: + - architectures: + - gfx908 + expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 + - name: WRITE_SIZE + description: The total kilobytes written to the video memory. This is measured + with all extra fetches and any cache or memory effects taken into account. 
+ properties: [] + definitions: + - architectures: + - gfx908 + expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 + - name: WRITE_REQ_32B + description: The total number of 32-byte effective memory writes. + properties: [] + definitions: + - architectures: + - gfx908 + expression: TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum) + - name: TA_TA_BUSY_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_TA_BUSY,sum) + - name: TA_TOTAL_WAVEFRONTS_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_TOTAL_WAVEFRONTS,sum) + - name: TA_ADDR_STALLED_BY_TC_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_ADDR_STALLED_BY_TC_CYCLES,sum) + - name: TA_ADDR_STALLED_BY_TD_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_ADDR_STALLED_BY_TD_CYCLES,sum) + - name: TA_DATA_STALLED_BY_TC_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_DATA_STALLED_BY_TC_CYCLES,sum) + - name: TA_FLAT_WAVEFRONTS_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_FLAT_WAVEFRONTS,sum) + - name: TA_FLAT_ATOMIC_WAVEFRONTS_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_FLAT_ATOMIC_WAVEFRONTS,sum) + - name: TA_BUFFER_WAVEFRONTS_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_BUFFER_WAVEFRONTS,sum) + - name: TA_BUFFER_READ_WAVEFRONTS_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_BUFFER_READ_WAVEFRONTS,sum) + - name: TA_BUFFER_WRITE_WAVEFRONTS_sum + description: . 
+ properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_BUFFER_WRITE_WAVEFRONTS,sum) + - name: TA_BUFFER_ATOMIC_WAVEFRONTS_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_BUFFER_ATOMIC_WAVEFRONTS,sum) + - name: TA_BUFFER_TOTAL_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_BUFFER_TOTAL_CYCLES,sum) + - name: TA_BUFFER_COALESCED_READ_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_BUFFER_COALESCED_READ_CYCLES,sum) + - name: TA_BUFFER_COALESCED_WRITE_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TA_BUFFER_COALESCED_WRITE_CYCLES,sum) + - name: TD_TD_BUSY_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TD_TD_BUSY,sum) + - name: TD_TC_STALL_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TD_TC_STALL,sum) + - name: TD_LOAD_WAVEFRONT_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TD_LOAD_WAVEFRONT,sum) + - name: TD_ATOMIC_WAVEFRONT_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TD_ATOMIC_WAVEFRONT,sum) + - name: TD_STORE_WAVEFRONT_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TD_STORE_WAVEFRONT,sum) + - name: TD_COALESCABLE_WAVEFRONT_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TD_COALESCABLE_WAVEFRONT,sum) + - name: TCP_GATE_EN1_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_GATE_EN1,sum) + - name: TCP_GATE_EN2_sum + description: . 
+ properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_GATE_EN2,sum) + - name: TCP_TCP_TA_DATA_STALL_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,sum) + - name: TCP_TD_TCP_STALL_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TD_TCP_STALL_CYCLES,sum) + - name: TCP_TCR_TCP_STALL_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCR_TCP_STALL_CYCLES,sum) + - name: TCP_READ_TAGCONFLICT_STALL_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_READ_TAGCONFLICT_STALL_CYCLES,sum) + - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,sum) + - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,sum) + - name: TCP_PENDING_STALL_CYCLES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_PENDING_STALL_CYCLES,sum) + - name: TCP_VOLATILE_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_VOLATILE,sum) + - name: TCP_TOTAL_ACCESSES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TOTAL_ACCESSES,sum) + - name: TCP_TOTAL_READ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TOTAL_READ,sum) + - name: TCP_TOTAL_WRITE_sum + description: . 
+ properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TOTAL_WRITE,sum) + - name: TCP_TOTAL_ATOMIC_WITH_RET_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TOTAL_ATOMIC_WITH_RET,sum) + - name: TCP_TOTAL_ATOMIC_WITHOUT_RET_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TOTAL_ATOMIC_WITHOUT_RET,sum) + - name: TCP_TOTAL_WRITEBACK_INVALIDATES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TOTAL_WRITEBACK_INVALIDATES,sum) + - name: TCP_UTCL1_REQUEST_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_UTCL1_REQUEST,sum) + - name: TCP_UTCL1_TRANSLATION_MISS_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_UTCL1_TRANSLATION_MISS,sum) + - name: TCP_UTCL1_TRANSLATION_HIT_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_UTCL1_TRANSLATION_HIT,sum) + - name: TCP_UTCL1_PERMISSION_MISS_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_UTCL1_PERMISSION_MISS,sum) + - name: TCP_TOTAL_CACHE_ACCESSES_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TOTAL_CACHE_ACCESSES,sum) + - name: TCP_TCP_LATENCY_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCP_LATENCY,sum) + - name: TCP_TA_TCP_STATE_READ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TA_TCP_STATE_READ,sum) + - name: TCP_TCC_READ_REQ_LATENCY_sum + description: . 
+ properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_READ_REQ_LATENCY,sum) + - name: TCP_TCC_WRITE_REQ_LATENCY_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_WRITE_REQ_LATENCY,sum) + - name: TCP_TCC_READ_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_READ_REQ,sum) + - name: TCP_TCC_WRITE_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_WRITE_REQ,sum) + - name: TCP_TCC_ATOMIC_WITH_RET_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_ATOMIC_WITH_RET_REQ,sum) + - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,sum) + - name: TCP_TCC_NC_READ_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_NC_READ_REQ,sum) + - name: TCP_TCC_NC_WRITE_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_NC_WRITE_REQ,sum) + - name: TCP_TCC_NC_ATOMIC_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_NC_ATOMIC_REQ,sum) + - name: TCP_TCC_UC_READ_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_UC_READ_REQ,sum) + - name: TCP_TCC_UC_WRITE_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_UC_WRITE_REQ,sum) + - name: TCP_TCC_UC_ATOMIC_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_UC_ATOMIC_REQ,sum) + - name: TCP_TCC_CC_READ_REQ_sum + description: . 
+ properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_CC_READ_REQ,sum) + - name: TCP_TCC_CC_WRITE_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_CC_WRITE_REQ,sum) + - name: TCP_TCC_CC_ATOMIC_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_CC_ATOMIC_REQ,sum) + - name: TCP_TCC_RW_READ_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_RW_READ_REQ,sum) + - name: TCP_TCC_RW_WRITE_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_RW_WRITE_REQ,sum) + - name: TCP_TCC_RW_ATOMIC_REQ_sum + description: . + properties: [] + definitions: + - architectures: + - gfx908 + expression: reduce(TCP_TCC_RW_ATOMIC_REQ,sum) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py index d15b14a5e7..fdd9e069bd 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py @@ -47,13 +47,11 @@ from utils.mi_gpu_spec import mi_gpu_specs from utils.parser import build_in_vars, supported_denom from utils.utils import ( add_counter_extra_config_input_yaml, - add_counter_from_source_to_target_extra_config_input_yaml, capture_subprocess_output, convert_metric_id_to_panel_idx, detect_rocprof, get_base_spi_pipe_counter, get_submodules, - is_counter_existed_in_extra_input_yaml, is_spi_pipe_counter, is_tcc_channel_counter, using_v3, @@ -495,6 +493,18 @@ class OmniSoC_Base: if "Name:" in line: counters, _ = self.parse_counters_text(line.split(":")[1].strip()) rocprof_counters.update(counters) + # Custom counter support for mi100 for rocprofv3 + if self._mspec.gpu_model.lower() == "mi100": + counter_defs_path = ( + 
config.rocprof_compute_home + / "rocprof_compute_soc" + / "profile_configs" + / "gfx908_counter_defs.yaml" + ) + with open(counter_defs_path, "r") as fp: + counter_defs_contents = fp.read() + counters, _ = self.parse_counters_text(counter_defs_contents) + rocprof_counters.update(counters) elif str(rocprof_cmd) == "rocprofiler-sdk": MAX_STR = 256 @@ -556,6 +566,18 @@ class OmniSoC_Base: rocprof_counters.add( ctypes.cast(name_args, ctypes.c_char_p).value.decode("utf-8") ) + # Custom counter support for mi100 for rocprofiler-sdk + if self._mspec.gpu_model.lower() == "mi100": + counter_defs_path = ( + config.rocprof_compute_home + / "rocprof_compute_soc" + / "profile_configs" + / "gfx908_counter_defs.yaml" + ) + with open(counter_defs_path, "r") as fp: + counter_defs_contents = fp.read() + counters, _ = self.parse_counters_text(counter_defs_contents) + rocprof_counters.update(counters) else: console_error( @@ -750,18 +772,6 @@ class OmniSoC_Base: else: # Output to files - with open( - str( - Path(config.rocprof_compute_home).joinpath( - "rocprof_compute_soc", - "profile_configs", - "accum_counters.yaml", - ) - ), - "r", - ) as fp: - accum_counters_def = yaml.safe_load(fp) - for f in output_files: file_name_txt = str(Path(workload_perfmon_dir).joinpath(f.file_name_txt)) file_name_yaml = str( @@ -777,16 +787,49 @@ class OmniSoC_Base: ]: pmc.append(ctr) if using_v3(): - if is_counter_existed_in_extra_input_yaml( - accum_counters_def, ctr - ) and not is_counter_existed_in_extra_input_yaml( - counter_def, ctr - ): - counter_def = ( - add_counter_from_source_to_target_extra_config_input_yaml( - accum_counters_def, counter_def, ctr + # MI 100 accumulate counters don't work with rocprofiler-sdk + if self._mspec.gpu_model.lower() != "mi100": + # Add accumulation counters definitions + if ctr == "SQ_IFETCH_LEVEL": + counter_def = add_counter_extra_config_input_yaml( + counter_def, + "SQ_IFETCH_LEVEL_ACCUM", + "SQ_IFETCH_LEVEL accumulation", + "accumulate(SQ_IFETCH_LEVEL,
HIGH_RES)", + [self.__arch], + ) + elif ctr == "SQ_INST_LEVEL_LDS": + counter_def = add_counter_extra_config_input_yaml( + counter_def, + "SQ_INST_LEVEL_LDS_ACCUM", + "SQ_INST_LEVEL_LDS accumulation", + "accumulate(SQ_INST_LEVEL_LDS, HIGH_RES)", + [self.__arch], + ) + elif ctr == "SQ_INST_LEVEL_SMEM": + counter_def = add_counter_extra_config_input_yaml( + counter_def, + "SQ_INST_LEVEL_SMEM_ACCUM", + "SQ_INST_LEVEL_SMEM accumulation", + "accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES)", + [self.__arch], + ) + elif ctr == "SQ_INST_LEVEL_VMEM": + counter_def = add_counter_extra_config_input_yaml( + counter_def, + "SQ_INST_LEVEL_VMEM_ACCUM", + "SQ_INST_LEVEL_VMEM accumulation", + "accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES)", + [self.__arch], + ) + elif ctr == "SQ_LEVEL_WAVES": + counter_def = add_counter_extra_config_input_yaml( + counter_def, + "SQ_LEVEL_WAVES_ACCUM", + "SQ_LEVEL_WAVES accumulation", + "accumulate(SQ_LEVEL_WAVES, HIGH_RES)", + [self.__arch], ) - ) # Add TCC channel counters definitions if is_tcc_channel_counter(ctr): counter_name = ctr.split("[")[0] @@ -813,10 +856,9 @@ class OmniSoC_Base: fd.close() # Write counter definitions to file - if using_v3(): + if counter_def: with open(file_name_yaml, "w") as fp: - if counter_def: - fp.write(yaml.dump(counter_def, sort_keys=False)) + fp.write(yaml.dump(counter_def, sort_keys=False)) # Add a timestamp file # TODO: Does v3 need this? 
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_gfx908.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_gfx908.py index bcaac75bd3..4263e5f778 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_gfx908.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_gfx908.py @@ -34,7 +34,7 @@ class gfx908_soc(OmniSoC_Base): def __init__(self, args, mspec): super().__init__(args, mspec) self.set_arch("gfx908") - self.set_compatible_profilers(["rocprofv1"]) + self.set_compatible_profilers(["rocprofv1", "rocprofv3", "rocprofiler-sdk"]) # Per IP block max number of simultaneous counters. GFX IP Blocks self.set_perfmon_config(mi_gpu_specs.get_perfmon_config("gfx908")) diff --git a/projects/rocprofiler-compute/src/utils/utils.py b/projects/rocprofiler-compute/src/utils/utils.py index adf34c0ffd..2118ff4749 100644 --- a/projects/rocprofiler-compute/src/utils/utils.py +++ b/projects/rocprofiler-compute/src/utils/utils.py @@ -62,21 +62,6 @@ def is_tcc_channel_counter(counter): return counter.startswith("TCC") and counter.endswith("]") -def is_counter_existed_in_extra_input_yaml(data: dict, counter_name: str) -> bool: - """ - Check if a counter with the given name exists in the rocprofiler-sdk counters. - - Args: - data (dict): The loaded YAML dictionary. - counter_name (str): The name of the counter to check. - - Returns: - bool: True if the counter exists, False otherwise. - """ - counters = data.get("rocprofiler-sdk", {}).get("counters", []) - return any(counter.get("name") == counter_name for counter in counters) - - def add_counter_extra_config_input_yaml( data: dict, counter_name: str, @@ -172,46 +157,6 @@ def extract_counter_info_extra_config_input_yaml( return None -def add_counter_from_source_to_target_extra_config_input_yaml( - source_data: dict, target_data: dict, counter_name: str -) -> dict: - """ - Check if counter_name exists in source_data, and if yes, add it to target_data. 
- - Args: - source_data (dict): Source YAML dictionary to extract from. - target_data (dict): Target YAML dictionary to add to. - counter_name (str): Name of the counter to copy. - - Returns: - dict: Updated target_data dictionary. - """ - counter = extract_counter_info_extra_config_input_yaml(source_data, counter_name) - if not counter: - raise ValueError(f"Counter '{counter_name}' not found in source data") - - # Extract required info - name = counter.get("name") - description = counter.get("description", "") - properties = counter.get("properties", []) - definitions = counter.get("definitions", []) - - if not definitions: - raise ValueError(f"Counter '{counter_name}' has no definitions") - - architectures = definitions[0].get("architectures", []) - expression = definitions[0].get("expression", "") - - return add_counter_extra_config_input_yaml( - target_data, - counter_name=name, - description=description, - expression=expression, - architectures=architectures, - properties=properties, - ) - - def is_spi_pipe_counter(counter): for pattern in spi_pipe_counter_regexs: if re.match(pattern, counter): @@ -806,57 +751,66 @@ def run_prof( else: options = ["-A", "absolute"] + options - new_env = None + new_env = os.environ.copy() - path_counter_config_yaml = path(fname).with_suffix(".yaml") - if using_v3() and path_counter_config_yaml.exists(): + if using_v3(): + # Default counter definitions + if rocprof_cmd == "rocprofiler-sdk": + counter_defs_path = ( + path(options["ROCP_TOOL_LIBRARIES"]) + .resolve() + .parent.parent.parent.joinpath( + "share", "rocprofiler-sdk", "counter_defs.yaml" + ) + ) + else: + counter_defs_path = ( + path(shutil.which(rocprof_cmd)) + .resolve() + .parent.parent.joinpath("share", "rocprofiler-sdk", "counter_defs.yaml") + ) + # Custom counter definitions for MI 100 + if mspec.gpu_model.lower() == "mi100": + counter_defs_path = ( + config.rocprof_compute_home + / "rocprof_compute_soc" + / "profile_configs" + / "gfx908_counter_defs.yaml" + ) + # 
Read counter definitions + with open(counter_defs_path, "r") as file: + counter_defs = yaml.safe_load(file) # Get extra counter definitions - with open(path_counter_config_yaml, "r") as file: - extra_counter_defs = yaml.safe_load(file) - if extra_counter_defs: - # Get default counter definitions path - if rocprof_cmd == "rocprofiler-sdk": - counter_defs_path = ( - path(options["ROCP_TOOL_LIBRARIES"]) - .resolve() - .parent.parent.parent.joinpath( - "share", "rocprofiler-sdk", "counter_defs.yaml" - ) - ) - else: - counter_defs_path = ( - path(shutil.which(rocprof_cmd)) - .resolve() - .parent.parent.joinpath( - "share", "rocprofiler-sdk", "counter_defs.yaml" - ) - ) - # Get default counter definitions - with open(counter_defs_path, "r") as file: - counter_defs = yaml.safe_load(file) - # Merge counter definitions + path_counter_config_yaml = path(fname).with_suffix(".yaml") + if path_counter_config_yaml.exists(): + with open(path_counter_config_yaml, "r") as file: + extra_counter_defs = yaml.safe_load(file) + # Merge extra counter definitions counter_defs["rocprofiler-sdk"]["counters"].extend( extra_counter_defs["rocprofiler-sdk"]["counters"] ) - # Write merged counter definitions to a temporary file - tmp_dir = tempfile.mkdtemp(prefix="rocprof_counter_defs_", dir="/tmp") - tmpfile_path = path(tmp_dir) / "counter_defs.yaml" - with open(tmpfile_path, "w") as tmpfile: - yaml.dump( - counter_defs, tmpfile, default_flow_style=False, sort_keys=False - ) - # Set the environment variable to point to the temporary file - if not new_env: - new_env = os.environ.copy() - new_env["ROCPROFILER_METRICS_PATH"] = str(path(tmp_dir)) - console_debug( - f"Adding env var for extra counters: ROCPROFILER_METRICS_PATH={new_env['ROCPROFILER_METRICS_PATH']}" - ) + # Write counter definitions to a temporary file + tmpfile_path = ( + path(tempfile.mkdtemp(prefix="rocprof_counter_defs_", dir="/tmp")) + / "counter_defs.yaml" + ) + with open(tmpfile_path, "w") as tmpfile: + yaml.dump(counter_defs, 
tmpfile, default_flow_style=False, sort_keys=False) + # Set rocprofiler sdk counter definitions + new_env["ROCPROFILER_METRICS_PATH"] = str(tmpfile_path.parent) + console_debug( + f"Adding env var for counter definitions: ROCPROFILER_METRICS_PATH={new_env['ROCPROFILER_METRICS_PATH']}" + ) - # set required env var for mi300 - if mspec.gpu_model.lower() not in ("mi50", "mi60", "mi210", "mi250", "mi250x"): - if not new_env: - new_env = os.environ.copy() + # set required env var for >= mi300 + if mspec.gpu_model.lower() not in ( + "mi50", + "mi60", + "mi100", + "mi210", + "mi250", + "mi250x", + ): new_env["ROCPROFILER_INDIVIDUAL_XCC_MODE"] = "1" is_timestamps = False @@ -866,8 +820,6 @@ def run_prof( if rocprof_cmd == "rocprofiler-sdk": app_cmd = options.pop("APP_CMD") - if not new_env: - new_env = os.environ.copy() for key, value in options.items(): new_env[key] = value console_debug("rocprof sdk env vars: {}".format(new_env)) @@ -878,14 +830,9 @@ def run_prof( else: console_debug("rocprof command: {}".format([rocprof_cmd] + options)) # profile the app - if new_env: - success, output = capture_subprocess_output( - [rocprof_cmd] + options, new_env=new_env, profileMode=True - ) - else: - success, output = capture_subprocess_output( - [rocprof_cmd] + options, profileMode=True - ) + success, output = capture_subprocess_output( + [rocprof_cmd] + options, new_env=new_env, profileMode=True + ) time_2 = time.time() console_debug( @@ -894,8 +841,8 @@ def run_prof( ) ) - # Delete temporary files - if new_env and "ROCPROFILER_METRICS_PATH" in new_env: + # Delete counter definition temporary directory + if new_env.get("ROCPROFILER_METRICS_PATH"): shutil.rmtree(new_env["ROCPROFILER_METRICS_PATH"], ignore_errors=True) if not success: @@ -959,7 +906,7 @@ def run_prof( workload_dir + "/out/pmc_1/results_" + fbase + ".csv", index=False ) - if new_env and not using_v3() and not using_v1(): + if not using_v3() and not using_v1(): # flatten tcc for applicable mi300 input f = 
path(workload_dir + "/out/pmc_1/results_" + fbase + ".csv") xcds = mi_gpu_specs.get_num_xcds(