SWDEV-458392 - Add gfx12 support.

Change-Id: I91bb6a3329bf77f26005a345c18b63b86922028a
Этот коммит содержится в:
jatang
2024-05-07 15:40:56 -04:00
коммит произвёл Jason Tang
родитель 4d0fee6637
Коммит e7b96b1e71
10 изменённых файлов: 199 добавлений и 11 удалений
+1 -1
Просмотреть файл
@@ -75,7 +75,7 @@ if [ -n "$ROCM_RPATH" ] ; then LD_RUNPATH_FLAG=" -Wl,--enable-new-dtags -Wl,--rp
if [ -z "$TO_CLEAN" ] ; then TO_CLEAN=yes; fi
if [ -z "$RUN_TEST" ] ; then RUN_TEST=no; fi
if [ -z "$ASAN" ] ; then ASAN=False; fi
if [ -z "$GPU_LIST" ] ; then GPU_LIST="gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1031"; fi
if [ -z "$GPU_LIST" ] ; then GPU_LIST="gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1031 gfx1200 gfx1201"; fi
ROCPROFILER_ROOT=$(cd $ROCPROFILER_ROOT && echo $PWD)
+26 -1
Просмотреть файл
@@ -462,4 +462,29 @@
<metric name="LDSBankConflict" expr="100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE" descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
</gfx11>
</metrics>
<gfx12>
<metric name="GPU_UTIL" expr="100*GRBM_GUI_ACTIVE/GRBM_COUNT" descr="Percentage of the time that GUI is active"></metric>
<metric name="WAVE_DEP_WAIT" expr="100*SQ_WAIT_ANY/SQ_WAVE_CYCLES" descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for anything."></metric>
<metric name="WAVE_ISSUE_WAIT" expr="100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES" descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction issue."></metric>
<metric name="TA_BUSY_avr" expr="avr(TA_TA_BUSY,16)" descr="TA block is busy. Average over TA instances."></metric>
<metric name="TA_BUSY_max" expr="max(TA_TA_BUSY,16)" descr="TA block is busy. Max over TA instances."></metric>
<metric name="TA_BUSY_min" expr="min(TA_TA_BUSY,16)" descr="TA block is busy. Min over TA instances."></metric>
<metric name="TA_BUFFER_LOAD_WAVEFRONTS_sum" expr="sum(TA_BUFFER_LOAD_WAVEFRONTS,16)" descr="Number of buffer load vec32 packets processed by the TA. Sum over TA instances."></metric>
<metric name="TA_BUFFER_STORE_WAVEFRONTS_sum" expr="sum(TA_BUFFER_STORE_WAVEFRONTS,16)" descr="Number of buffer store vec32 packets processed by the TA. Sum over TA instances."></metric>
<metric name="GL2C_HIT_sum" expr="sum(GL2C_HIT,16)" descr="Number of cache hits. Sum over GL2C instances."></metric>
<metric name="GL2C_MISS_sum" expr="sum(GL2C_MISS,16)" descr="Number of cache misses. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_32B_sum" expr="sum(GL2C_EA_RDREQ_32B,16)" descr="Number of 32-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_64B_sum" expr="sum(GL2C_EA_RDREQ_64B,16)" descr="Number of 64-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_96B_sum" expr="sum(GL2C_EA_RDREQ_96B,16)" descr="Number of 96-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_128B_sum" expr="sum(GL2C_EA_RDREQ_128B,16)" descr="Number of 128-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_MC_RDREQ_sum" expr="sum(GL2C_MC_RDREQ,16)" descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances."></metric>
<metric name="GL2C_MC_WRREQ_sum" expr="sum(GL2C_MC_WRREQ,16)" descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_WRREQ_64B_sum" expr="sum(GL2C_EA_WRREQ_64B,16)" descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the GL2C_EA_wrreq interface. Sum over GL2C instances."></metric>
<metric name="GL2C_WRREQ_STALL_max" expr="max(GL2C_MC_WRREQ_STALL,16)" descr="Number of cycles a write request was stalled. Max over GL2C instances."></metric>
<metric name="L2CacheHit" expr="100*sum(GL2C_HIT,16)/(sum(GL2C_HIT,16)+sum(GL2C_MISS,16))" descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
<metric name="FETCH_SIZE" expr="(GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024" descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
<metric name="WriteUnitStalled" expr="100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE" descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
<metric name="LDSBankConflict" expr="100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE" descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
</gfx12>
</metrics>
+46
Просмотреть файл
@@ -737,3 +737,49 @@
<gfx1101 base="gfx11"></gfx1101>
<gfx1102 base="gfx11"></gfx1102>
<gfx12>
<metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
<metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
<metric name="GL2C_HIT" block=GL2C event=42 descr="Number of cache hits"></metric>
<metric name="GL2C_MISS" block=GL2C event=43 descr="Number of cache misses. UC reads count as misses."></metric>
<metric name="GL2C_MC_WRREQ" block=GL2C event=83 descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands"></metric>
<metric name="GL2C_EA_WRREQ_64B" block=GL2C event=85 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
<metric name="GL2C_MC_WRREQ_STALL" block=GL2C event=88 descr="Number of cycles a write request was stalled."></metric>
<metric name="GL2C_MC_RDREQ" block=GL2C event=96 descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte)."></metric>
<metric name="GL2C_EA_RDREQ_32B" block=GL2C event=99 descr="Number of 32-byte GL2C/EA read requests"></metric>
<metric name="GL2C_EA_RDREQ_64B" block=GL2C event=100 descr="Number of 64-byte GL2C/EA read requests"></metric>
<metric name="GL2C_EA_RDREQ_96B" block=GL2C event=101 descr="Number of 96-byte GL2C/EA read requests"></metric>
<metric name="GL2C_EA_RDREQ_128B" block=GL2C event=102 descr="Number of 128-byte GL2C/EA read requests"></metric>
<metric name="SQ_ACCUM_PREV" block=SQ event=1 descr="For counter N, increment by the value of counter N-1."></metric>
<metric name="SQ_BUSY_CYCLES" block=SQ event=3 descr="Clock cycles while SQ is reporting that it is busy. {nondeterministic, global, C2}"></metric>
<metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. {emulated, global, C1}"></metric>
<metric name="SQ_WAVE_CYCLES" block=SQ event=24 descr="Number of clock cycles spent by waves in the SQs. Incremented by number of living (valid) waves each cycle. {nondeterministic, C1}"></metric>
<metric name="SQ_WAIT_INST_ANY" block=SQ event=26 descr="Number of clock-cycles spent waiting for any instruction issue. In units of cycles. (nondeterministic)"></metric>
<metric name="SQ_WAIT_ANY" block=SQ event=35 descr="Number of wave-cycles spent waiting for anything (nondeterministic, C1)"></metric>
<metric name="SQ_INSTS_WAVE32" block=SQ event=70 descr="Number of wave32 instructions issued, for flat, lds, valu, tex. {emulated, C1}"></metric>
<metric name="SQ_INSTS_WAVE32_LDS" block=SQ event=72 descr="Number of wave32 LDS indexed instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_WAVE32_VALU" block=SQ event=73 descr="Number of wave32 valu instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
<metric name="SQ_WAVE32_INSTS" block=SQ event=82 descr="Number of instructions issued by wave32 waves. Skipped instructions are not counted. {emulated}"></metric>
<metric name="SQ_WAVE64_INSTS" block=SQ event=83 descr="Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated}"></metric>
<metric name="SQ_INST_LEVEL_GDS" block=SQ event=87 descr="Number of in-flight GDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_GDS for average latency. {level, nondeterministic, C1}"></metric>
<metric name="SQ_INST_LEVEL_LDS" block=SQ event=88 descr="Number of in-flight LDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes FLAT instructions. {level, nondeterministic, C1}"></metric>
<metric name="SQ_INST_CYCLES_VMEM" block=SQ event=106 descr="Number of cycles needed to send addr and data for VMEM (lds, buffer, image, flat, scratch, global) instructions, windowed by perf_en. {emulated, C1}"></metric>
<metric name="SQC_LDS_BANK_CONFLICT" block=SQ event=256 descr="Number of cycles LDS is stalled by bank conflicts. (emulated, C1)"></metric>
<metric name="SQC_LDS_IDX_ACTIVE" block=SQ event=261 descr="Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. {per-simd, emulated, C1}"></metric>
<metric name="SQ_INSTS_VALU" block=SQ event=62 descr="Number of VALU instructions issued excluding skipped instructions. {emulated, C1}"></metric>
<metric name="SQ_INSTS_SALU" block=SQ event=58 descr="Number of SALU instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_SMEM" block=SQ event=59 descr="Number of SMEM instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_FLAT" block=SQ event=56 descr="Number of FLAT instructions issued. {emulated, C2}"></metric>
<metric name="SQ_INSTS_LDS" block=SQ event=57 descr="Number of LDS indexed instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_GDS" block=SQ event=54 descr="Number of GDS instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_TEX_LOAD" block=SQ event=66 descr="Number of buffer load, image load, sample, or atomic (with return) instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_TEX_STORE" block=SQ event=67 descr="Number of buffer store, image store, or atomic (without return) instructions issued. {emulated, C1}"></metric>
<metric name="SQ_WAIT_INST_LDS" block=SQ event=29 descr="Number of clock cycles spent waiting for LDS (indexed) instruction issue. In units of cycles. {nondeterministic, C1}"></metric>
<metric name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
<metric name="TA_BUFFER_LOAD_WAVEFRONTS" block=TA event=45 descr="Number of buffer load vec32 packets processed by TA"></metric>
<metric name="TA_BUFFER_STORE_WAVEFRONTS" block=TA event=46 descr="Number of buffer store vec32 packets processed by TA"></metric>
</gfx12>
<gfx1200 base="gfx12"></gfx1200>
<gfx1201 base="gfx12"></gfx1201>
+32
Просмотреть файл
@@ -555,6 +555,35 @@
<metric name="LDSBankConflict" expr=100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
</gfx11_expr>
# Navi4x (gfx12) derived-metric expressions. The gfx1200_expr and gfx1201_expr tables are declared with base="gfx12_expr".
<gfx12_expr>
<metric name="KERNEL_DURATION" expr=1 descr="The duration of the kernel dispatch"></metric>
<metric name="GPU_UTIL" expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT descr="Percentage of the time that GUI is active"></metric>
<metric name="WAVE_DEP_WAIT" expr=100*SQ_WAIT_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for anything."></metric>
<metric name="WAVE_ISSUE_WAIT" expr=100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction issue."></metric>
<metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
<metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
<metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
<metric name="TA_BUFFER_LOAD_WAVEFRONTS_sum" expr=sum(TA_BUFFER_LOAD_WAVEFRONTS,16) descr="Number of buffer load vec32 packets processed by the TA. Sum over TA instances."></metric>
<metric name="TA_BUFFER_STORE_WAVEFRONTS_sum" expr=sum(TA_BUFFER_STORE_WAVEFRONTS,16) descr="Number of buffer store vec32 packets processed by the TA. Sum over TA instances."></metric>
# NOTE(review): the GL2C reductions below pass 32 as the second argument (presumably the instance count), while the TA reductions above pass 16 and the other gfx12 derived-metric tables in this change pass 16 for GL2C - confirm 32 is intended here.
<metric name="GL2C_HIT_sum" expr=sum(GL2C_HIT,32) descr="Number of cache hits. Sum over GL2C instances."></metric>
<metric name="GL2C_MISS_sum" expr=sum(GL2C_MISS,32) descr="Number of cache misses. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_32B_sum" expr=sum(GL2C_EA_RDREQ_32B,32) descr="Number of 32-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_64B_sum" expr=sum(GL2C_EA_RDREQ_64B,32) descr="Number of 64-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_96B_sum" expr=sum(GL2C_EA_RDREQ_96B,32) descr="Number of 96-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_128B_sum" expr=sum(GL2C_EA_RDREQ_128B,32) descr="Number of 128-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_MC_RDREQ_sum" expr=sum(GL2C_MC_RDREQ,32) descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances."></metric>
<metric name="GL2C_MC_WRREQ_sum" expr=sum(GL2C_MC_WRREQ,32) descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_WRREQ_64B_sum" expr=sum(GL2C_EA_WRREQ_64B,32) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the GL2C_EA_wrreq interface. Sum over GL2C instances."></metric>
<metric name="GL2C_WRREQ_STALL_max" expr=max(GL2C_MC_WRREQ_STALL,32) descr="Number of cycles a write request was stalled. Max over GL2C instances."></metric>
<metric name="L2CacheHit" expr=100*sum(GL2C_HIT,32)/(sum(GL2C_HIT,32)+sum(GL2C_MISS,32)) descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
# The three metrics below are built from other derived metrics in this table (the *_sum / *_max entries) and from raw counter names.
<metric name="FETCH_SIZE" expr=(GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
<metric name="WriteUnitStalled" expr=100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
<metric name="LDSBankConflict" expr=100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
</gfx12_expr>
<gfx1200_expr base="gfx12_expr"></gfx1200_expr>
<gfx1201_expr base="gfx12_expr"></gfx1201_expr>
<gfx1100_expr base="gfx11_expr"></gfx1100_expr>
<gfx1101_expr base="gfx11_expr"></gfx1101_expr>
<gfx1102_expr base="gfx11_expr"></gfx1102_expr>
@@ -581,6 +610,9 @@
<gfx1100 base="gfx1100_expr"></gfx1100>
<gfx1101 base="gfx1101_expr"></gfx1101>
<gfx1102 base="gfx1102_expr"></gfx1102>
#Navi4x
<gfx1200 base="gfx1200_expr"></gfx1200>
<gfx1201 base="gfx1201_expr"></gfx1201>
<global>
+4 -3
Просмотреть файл
@@ -225,10 +225,11 @@ class MetricsDict {
agent_name_ = agent_name_.substr(0, agent_name_.find(':'));
std::unordered_set<std::string> supported_agent_names = {
"gfx906", "gfx908", "gfx90a", // Vega
"gfx940", "gfx941", "gfx942", // Mi300
"gfx906", "gfx908", "gfx90a", // Vega
"gfx940", "gfx941", "gfx942", // Mi300
"gfx1030", "gfx1031", "gfx1032", // Navi2x
"gfx1100", "gfx1101", "gfx1102" // Navi3x
"gfx1100", "gfx1101", "gfx1102", // Navi3x
"gfx1200", "gfx1201", // Navi4x
};
if (supported_agent_names.find(agent_name_) != supported_agent_names.end()) {
ImportMetrics(agent_info, agent_name_);
+2 -1
Просмотреть файл
@@ -211,7 +211,8 @@ class MetricsDict {
"gfx906", "gfx908", "gfx90a", // Vega
"gfx940", "gfx941", "gfx942", // Mi300
"gfx1030", "gfx1031", "gfx1032", // Navi2x
"gfx1100", "gfx1101", "gfx1102" // Navi3x
"gfx1100", "gfx1101", "gfx1102", // Navi3x
"gfx1200", "gfx1201", // Navi4x
};
if (supported_agent_names.find(agent_name_) != supported_agent_names.end()) {
ImportMetrics(agent_info, agent_name_);
+49
Просмотреть файл
@@ -741,3 +741,52 @@
<gfx1101 base="gfx11"></gfx1101>
<gfx1102 base="gfx11"></gfx1102>
<gfx12>
<metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
<metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
<metric name="GL2C_HIT" block=GL2C event=42 descr="Number of cache hits"></metric>
<metric name="GL2C_MISS" block=GL2C event=43 descr="Number of cache misses. UC reads count as misses."></metric>
<metric name="GL2C_MC_WRREQ" block=GL2C event=83 descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands"></metric>
<metric name="GL2C_EA_WRREQ_64B" block=GL2C event=85 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
<metric name="GL2C_MC_WRREQ_STALL" block=GL2C event=88 descr="Number of cycles a write request was stalled."></metric>
<metric name="GL2C_MC_RDREQ" block=GL2C event=96 descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte)."></metric>
<metric name="GL2C_EA_RDREQ_32B" block=GL2C event=99 descr="Number of 32-byte GL2C/EA read requests"></metric>
<metric name="GL2C_EA_RDREQ_64B" block=GL2C event=100 descr="Number of 64-byte GL2C/EA read requests"></metric>
<metric name="GL2C_EA_RDREQ_96B" block=GL2C event=101 descr="Number of 96-byte GL2C/EA read requests"></metric>
<metric name="GL2C_EA_RDREQ_128B" block=GL2C event=102 descr="Number of 128-byte GL2C/EA read requests"></metric>
<metric name="SQ_ACCUM_PREV" block=SQ event=1 descr="For counter N, increment by the value of counter N-1."></metric>
<metric name="SQ_BUSY_CYCLES" block=SQ event=3 descr="Clock cycles while SQ is reporting that it is busy. {nondeterministic, global, C2}"></metric>
<metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. {emulated, global, C1}"></metric>
<metric name="SQ_WAVE_CYCLES" block=SQ event=24 descr="Number of clock cycles spent by waves in the SQs. Incremented by number of living (valid) waves each cycle. {nondeterministic, C1}"></metric>
<metric name="SQ_WAIT_INST_ANY" block=SQ event=26 descr="Number of clock-cycles spent waiting for any instruction issue. In units of cycles. (nondeterministic)"></metric>
<metric name="SQ_WAIT_ANY" block=SQ event=35 descr="Number of wave-cycles spent waiting for anything (nondeterministic, C1)"></metric>
<metric name="SQ_INSTS_WAVE32" block=SQ event=70 descr="Number of wave32 instructions issued, for flat, lds, valu, tex. {emulated, C1}"></metric>
<metric name="SQ_INSTS_WAVE32_LDS" block=SQ event=72 descr="Number of wave32 LDS indexed instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_WAVE32_VALU" block=SQ event=73 descr="Number of wave32 valu instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
<metric name="SQ_WAVE32_INSTS" block=SQ event=82 descr="Number of instructions issued by wave32 waves. Skipped instructions are not counted. {emulated}"></metric>
<metric name="SQ_WAVE64_INSTS" block=SQ event=83 descr="Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated}"></metric>
<metric name="SQ_INST_LEVEL_GDS" block=SQ event=87 descr="Number of in-flight GDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_GDS for average latency. {level, nondeterministic, C1}"></metric>
<metric name="SQ_INST_LEVEL_LDS" block=SQ event=88 descr="Number of in-flight LDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes FLAT instructions. {level, nondeterministic, C1}"></metric>
<metric name="SQ_INST_CYCLES_VMEM" block=SQ event=106 descr="Number of cycles needed to send addr and data for VMEM (lds, buffer, image, flat, scratch, global) instructions, windowed by perf_en. {emulated, C1}"></metric>
<metric name="SQC_LDS_BANK_CONFLICT" block=SQ event=256 descr="Number of cycles LDS is stalled by bank conflicts. (emulated, C1)"></metric>
<metric name="SQC_LDS_IDX_ACTIVE" block=SQ event=261 descr="Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. {per-simd, emulated, C1}"></metric>
<metric name="SQ_INSTS_VALU" block=SQ event=62 descr="Number of VALU instructions issued excluding skipped instructions. {emulated, C1}"></metric>
<metric name="SQ_INSTS_SALU" block=SQ event=58 descr="Number of SALU instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_SMEM" block=SQ event=59 descr="Number of SMEM instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_FLAT" block=SQ event=56 descr="Number of FLAT instructions issued. {emulated, C2}"></metric>
<metric name="SQ_INSTS_LDS" block=SQ event=57 descr="Number of LDS indexed instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_GDS" block=SQ event=54 descr="Number of GDS instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_TEX_LOAD" block=SQ event=66 descr="Number of buffer load, image load, sample, or atomic (with return) instructions issued. {emulated, C1}"></metric>
<metric name="SQ_INSTS_TEX_STORE" block=SQ event=67 descr="Number of buffer store, image store, or atomic (without return) instructions issued. {emulated, C1}"></metric>
<metric name="SQ_WAIT_INST_LDS" block=SQ event=29 descr="Number of clock cycles spent waiting for LDS (indexed) instruction issue. In units of cycles. {nondeterministic, C1}"></metric>
<metric name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
<metric name="TA_BUFFER_LOAD_WAVEFRONTS" block=TA event=45 descr="Number of buffer load vec32 packets processed by TA"></metric>
<metric name="TA_BUFFER_STORE_WAVEFRONTS" block=TA event=46 descr="Number of buffer store vec32 packets processed by TA"></metric>
</gfx12>
<gfx1200 base="gfx12"></gfx1200>
<gfx1201 base="gfx12"></gfx1201>
+33
Просмотреть файл
@@ -422,6 +422,36 @@
<metric name="LDSBankConflict" expr=100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
</gfx11_expr>
# Navi4x (gfx12) derived-metric expressions. The gfx1200_expr and gfx1201_expr tables are declared with base="gfx12_expr".
<gfx12_expr>
<metric name="GPU_UTIL" expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT descr="Percentage of the time that GUI is active"></metric>
<metric name="WAVE_DEP_WAIT" expr=100*SQ_WAIT_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for anything."></metric>
<metric name="WAVE_ISSUE_WAIT" expr=100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction issue."></metric>
<metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
<metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
<metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
<metric name="TA_BUFFER_LOAD_WAVEFRONTS_sum" expr=sum(TA_BUFFER_LOAD_WAVEFRONTS,16) descr="Number of buffer load vec32 packets processed by the TA. Sum over TA instances."></metric>
<metric name="TA_BUFFER_STORE_WAVEFRONTS_sum" expr=sum(TA_BUFFER_STORE_WAVEFRONTS,16) descr="Number of buffer store vec32 packets processed by the TA. Sum over TA instances."></metric>
<metric name="GL2C_HIT_sum" expr=sum(GL2C_HIT,16) descr="Number of cache hits. Sum over GL2C instances."></metric>
<metric name="GL2C_MISS_sum" expr=sum(GL2C_MISS,16) descr="Number of cache misses. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_32B_sum" expr=sum(GL2C_EA_RDREQ_32B,16) descr="Number of 32-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_64B_sum" expr=sum(GL2C_EA_RDREQ_64B,16) descr="Number of 64-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_96B_sum" expr=sum(GL2C_EA_RDREQ_96B,16) descr="Number of 96-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_RDREQ_128B_sum" expr=sum(GL2C_EA_RDREQ_128B,16) descr="Number of 128-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
<metric name="GL2C_MC_RDREQ_sum" expr=sum(GL2C_MC_RDREQ,16) descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances."></metric>
<metric name="GL2C_MC_WRREQ_sum" expr=sum(GL2C_MC_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C instances."></metric>
<metric name="GL2C_EA_WRREQ_64B_sum" expr=sum(GL2C_EA_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the GL2C_EA_wrreq interface. Sum over GL2C instances."></metric>
<metric name="GL2C_WRREQ_STALL_max" expr=max(GL2C_MC_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over GL2C instances."></metric>
<metric name="L2CacheHit" expr=100*sum(GL2C_HIT,16)/(sum(GL2C_HIT,16)+sum(GL2C_MISS,16)) descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
# The three metrics below are built from other derived metrics in this table (the *_sum / *_max entries) and from raw counter names.
<metric name="FETCH_SIZE" expr=(GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
<metric name="WriteUnitStalled" expr=100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
<metric name="LDSBankConflict" expr=100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
</gfx12_expr>
<gfx1200_expr base="gfx12_expr"></gfx1200_expr>
<gfx1201_expr base="gfx12_expr"></gfx1201_expr>
<gfx1100_expr base="gfx11_expr"></gfx1100_expr>
<gfx1101_expr base="gfx11_expr"></gfx1101_expr>
<gfx1102_expr base="gfx11_expr"></gfx1102_expr>
@@ -459,6 +489,9 @@
<gfx1101 base="gfx1101_expr"></gfx1101>
<gfx1102 base="gfx1102_expr"></gfx1102>
<gfx1200 base="gfx1200_expr"></gfx1200>
<gfx1201 base="gfx1201_expr"></gfx1201>
<global>
# GPUBusy The percentage of time GPU was busy.
<metric
+1 -1
Просмотреть файл
@@ -308,7 +308,7 @@ function(generate_hsaco TARGET_ID INPUT_FILE OUTPUT_FILE)
PARENT_SCOPE)
endfunction(generate_hsaco)
set(GPU_LIST "gfx900" "gfx906" "gfx908" "gfx90a" "gfx942" "gfx1030" "gfx1031" "gfx1032" "gfx1100" "gfx1101" "gfx1102")
set(GPU_LIST "gfx900" "gfx906" "gfx908" "gfx90a" "gfx942" "gfx1030" "gfx1031" "gfx1032" "gfx1100" "gfx1101" "gfx1102" "gfx1200" "gfx1201")
foreach(target_id ${GPU_LIST})
# generate kernel bitcodes
generate_hsaco(${target_id} ${CMAKE_CURRENT_SOURCE_DIR}/apps/copy.cl
+5 -4
Просмотреть файл
@@ -45,7 +45,7 @@ std::string test_app_path;
std::string metrics_path;
std::string binary_path;
std::string profiler_api_lib_path = "";
bool bContainsGFX11Agent = false;
bool bSkipCounterNoneZeroCheck = false;
static void init_test_path() {
lib_path = "lib/rocprofiler/librocprofiler_tool.so";
@@ -519,8 +519,9 @@ TEST_F(LoadUnloadTest, WhenLoadingFirstTimeThenToolLoadsUnloadsSuccessfully) {
std::string agentname;
agentname.resize(64);
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agentname.data());
if (agentname.find("gfx11") != std::string::npos)
bContainsGFX11Agent = true;
if ((agentname.find("gfx11") != std::string::npos) ||
(agentname.find("gfx12") != std::string::npos))
bSkipCounterNoneZeroCheck = true;
hsa_device_type_t type;
return hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
@@ -896,7 +897,7 @@ class ProfilerAPITest : public ::testing::Test {
CheckApi(rocprofiler_query_kernel_info(ROCPROFILER_KERNEL_NAME, profiler_record->kernel_id,
&kernel_name_c));
if (profiler_record->counters && !bContainsGFX11Agent)
if (profiler_record->counters && !bSkipCounterNoneZeroCheck)
for (uint64_t i = 0; i < profiler_record->counters_count.value; i++)
if (profiler_record->counters[i].counter_handler.handle > 0)
EXPECT_NE(profiler_record->counters[i].value.value, 0);