[rocprofiler][navi4] Remove navi4x support on rocprofv2. (#307)
* Remove navi4x support on rocprofv2. * remove gfx12 from build scripts. * bug fix. * address comments. * update changelog * Update CHANGELOG.md * Update CHANGELOG.md * Update CHANGELOG.md * address comments Co-authored-by: Swati Rawat <120587655+SwRaw@users.noreply.github.com> --------- Co-authored-by: Venkateshwar Reddy Kandula <venkateshwar.kandula1306@gmail.com> Co-authored-by: Swati Rawat <120587655+SwRaw@users.noreply.github.com>
Esse commit está contido em:
commit de
GitHub
pai
25922d08c3
commit
997b36f5bc
@@ -393,3 +393,14 @@ Example for file plugin output:
|
||||
### Resolved issues
|
||||
|
||||
- Fixed issue where invalid UTF-8 characters in a trace log would crash the program
|
||||
|
||||
## ROCProfiler for ROCm 7.1
|
||||
|
||||
### Added
|
||||
|
||||
### Resolved issues
|
||||
|
||||
### Removed
|
||||
|
||||
- Removed gfx12 (Navi4x) support from `rocprofv2`. For gfx12, use the `rocprofv3` tool.
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ if [ -n "$ROCM_RPATH" ] ; then LD_RUNPATH_FLAG=" -Wl,--enable-new-dtags -Wl,--rp
|
||||
if [ -z "$TO_CLEAN" ] ; then TO_CLEAN=yes; fi
|
||||
if [ -z "$RUN_TEST" ] ; then RUN_TEST=no; fi
|
||||
if [ -z "$ASAN" ] ; then ASAN=False; fi
|
||||
if [ -z "$GPU_LIST" ] ; then GPU_LIST="gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1031 gfx1150 gfx1151 gfx1200 gfx1201"; fi
|
||||
if [ -z "$GPU_LIST" ] ; then GPU_LIST="gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1031 gfx1150 gfx1151"; fi
|
||||
|
||||
ROCPROFILER_ROOT=$(cd $ROCPROFILER_ROOT && echo $PWD)
|
||||
|
||||
|
||||
@@ -474,29 +474,4 @@
|
||||
<metric name="OccupancyPercent" expr="100*SQ_WAVE_CYCLES/GRBM_GUI_ACTIVE/CU_NUM/32" descr="GPU occupancy as % of maximum."></metric>
|
||||
</gfx11>
|
||||
|
||||
<gfx12>
|
||||
<metric name="GPU_UTIL" expr="100*GRBM_GUI_ACTIVE/GRBM_COUNT" descr="Percentage of the time that GUI is active"></metric>
|
||||
<metric name="WAVE_DEP_WAIT" expr="100*SQ_WAIT_ANY/SQ_WAVE_CYCLES" descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for anything."></metric>
|
||||
<metric name="WAVE_ISSUE_WAIT" expr="100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES" descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction issue."></metric>
|
||||
<metric name="TA_BUSY_avr" expr="avr(TA_TA_BUSY,16)" descr="TA block is busy. Average over TA instances."></metric>
|
||||
<metric name="TA_BUSY_max" expr="max(TA_TA_BUSY,16)" descr="TA block is busy. Max over TA instances."></metric>
|
||||
<metric name="TA_BUSY_min" expr="min(TA_TA_BUSY,16)" descr="TA block is busy. Min over TA instances."></metric>
|
||||
<metric name="TA_BUFFER_LOAD_WAVEFRONTS_sum" expr="sum(TA_BUFFER_LOAD_WAVEFRONTS,16)" descr="Number of buffer load vec32 packets processed by the TA. Sum over TA instances."></metric>
|
||||
<metric name="TA_BUFFER_STORE_WAVEFRONTS_sum" expr="sum(TA_BUFFER_STORE_WAVEFRONTS,16)" descr="Number of buffer store vec32 packets processed by the TA. Sum over TA instances."></metric>
|
||||
<metric name="GL2C_HIT_sum" expr="sum(GL2C_HIT,16)" descr="Number of cache hits. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_MISS_sum" expr="sum(GL2C_MISS,16)" descr="Number of cache misses. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_32B_sum" expr="sum(GL2C_EA_RDREQ_32B,16)" descr="Number of 32-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_64B_sum" expr="sum(GL2C_EA_RDREQ_64B,16)" descr="Number of 64-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_96B_sum" expr="sum(GL2C_EA_RDREQ_96B,16)" descr="Number of 96-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_128B_sum" expr="sum(GL2C_EA_RDREQ_128B,16)" descr="Number of 128-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_MC_RDREQ_sum" expr="sum(GL2C_MC_RDREQ,16)" descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_MC_WRREQ_sum" expr="sum(GL2C_MC_WRREQ,16)" descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_WRREQ_64B_sum" expr="sum(GL2C_EA_WRREQ_64B,16)" descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the GL2C_EA_wrreq interface. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_WRREQ_STALL_max" expr="max(GL2C_MC_WRREQ_STALL,16)" descr="Number of cycles a write request was stalled. Max over GL2C instances."></metric>
|
||||
<metric name="L2CacheHit" expr="100*sum(GL2C_HIT,16)/(sum(GL2C_HIT,16)+sum(GL2C_MISS,16))" descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
|
||||
<metric name="FETCH_SIZE" expr="(GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024" descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
|
||||
<metric name="WriteUnitStalled" expr="100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE" descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
|
||||
<metric name="LDSBankConflict" expr="100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE" descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
|
||||
</gfx12>
|
||||
|
||||
</metrics>
|
||||
|
||||
@@ -741,49 +741,3 @@
|
||||
<gfx1150 base="gfx11"></gfx1150>
|
||||
<gfx1151 base="gfx11"></gfx1151>
|
||||
|
||||
<gfx12>
|
||||
<metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
|
||||
<metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
|
||||
<metric name="GL2C_HIT" block=GL2C event=42 descr="Number of cache hits"></metric>
|
||||
<metric name="GL2C_MISS" block=GL2C event=43 descr="Number of cache misses. UC reads count as misses."></metric>
|
||||
<metric name="GL2C_MC_WRREQ" block=GL2C event=83 descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands"></metric>
|
||||
<metric name="GL2C_EA_WRREQ_64B" block=GL2C event=85 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
|
||||
<metric name="GL2C_MC_WRREQ_STALL" block=GL2C event=88 descr="Number of cycles a write request was stalled."></metric>
|
||||
<metric name="GL2C_MC_RDREQ" block=GL2C event=96 descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte)."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_32B" block=GL2C event=99 descr="Number of 32-byte GL2C/EA read requests"></metric>
|
||||
<metric name="GL2C_EA_RDREQ_64B" block=GL2C event=100 descr="Number of 64-byte GL2C/EA read requests"></metric>
|
||||
<metric name="GL2C_EA_RDREQ_96B" block=GL2C event=101 descr="Number of 96-byte GL2C/EA read requests"></metric>
|
||||
<metric name="GL2C_EA_RDREQ_128B" block=GL2C event=102 descr="Number of 128-byte GL2C/EA read requests"></metric>
|
||||
<metric name="SQ_ACCUM_PREV" block=SQ event=1 descr="For counter N, increment by the value of counter N-1."></metric>
|
||||
<metric name="SQ_BUSY_CYCLES" block=SQ event=3 descr="Clock cycles while SQ is reporting that it is busy. {nondeterministic, global, C2}"></metric>
|
||||
<metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. {emulated, global, C1}"></metric>
|
||||
<metric name="SQ_WAVE_CYCLES" block=SQ event=24 descr="Number of clock cycles spent by waves in the SQs. Incremented by number of living (valid) waves each cycle. {nondeterministic, C1}"></metric>
|
||||
<metric name="SQ_WAIT_INST_ANY" block=SQ event=26 descr="Number of clock-cycles spent waiting for any instruction issue. In units of cycles. (nondeterministic)"></metric>
|
||||
<metric name="SQ_WAIT_ANY" block=SQ event=35 descr="Number of wave-cycles spent waiting for anything (nondeterministic, C1)"></metric>
|
||||
<metric name="SQ_INSTS_WAVE32" block=SQ event=70 descr="Number of wave32 instructions issued, for flat, lds, valu, tex. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_WAVE32_LDS" block=SQ event=72 descr="Number of wave32 LDS indexed instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_WAVE32_VALU" block=SQ event=73 descr="Number of wave32 valu instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_WAVE32_INSTS" block=SQ event=82 descr="Number of instructions issued by wave32 waves. Skipped instructions are not counted. {emulated}"></metric>
|
||||
<metric name="SQ_WAVE64_INSTS" block=SQ event=83 descr="Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated}"></metric>
|
||||
<metric name="SQ_INST_LEVEL_GDS" block=SQ event=87 descr="Number of in-flight GDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_GDS for average latency. {level, nondeterministic, C1}"></metric>
|
||||
<metric name="SQ_INST_LEVEL_LDS" block=SQ event=88 descr="Number of in-flight LDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes FLAT instructions. {level, nondeterministic, C1}"></metric>
|
||||
<metric name="SQ_INST_CYCLES_VMEM" block=SQ event=106 descr="Number of cycles needed to send addr and data for VMEM (lds, buffer, image, flat, scratch, global) instructions, windowed by perf_en. {emulated, C1}"></metric>
|
||||
<metric name="SQC_LDS_BANK_CONFLICT" block=SQ event=256 descr="Number of cycles LDS is stalled by bank conflicts. (emulated, C1)"></metric>
|
||||
<metric name="SQC_LDS_IDX_ACTIVE" block=SQ event=261 descr="Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. {per-simd, emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_VALU" block=SQ event=62 descr="Number of VALU instructions issued excluding skipped instructions. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_SALU" block=SQ event=58 descr="Number of SALU instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_SMEM" block=SQ event=59 descr="Number of SMEM instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_FLAT" block=SQ event=56 descr="Number of FLAT instructions issued. {emulated, C2}"></metric>
|
||||
<metric name="SQ_INSTS_LDS" block=SQ event=57 descr="Number of LDS indexed instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_GDS" block=SQ event=54 descr="Number of GDS instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_TEX_LOAD" block=SQ event=66 descr="Number of buffer load, image load, sample, or atomic (with return) instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_TEX_STORE" block=SQ event=67 descr="Number of buffer store, image store, or atomic (without return) instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_WAIT_INST_LDS" block=SQ event=29 descr="Number of clock cycles spent waiting for LDS (indexed) instruction issue. In units of cycles. {nondeterministic, C1}"></metric>
|
||||
<metric name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
|
||||
<metric name="TA_BUFFER_LOAD_WAVEFRONTS" block=TA event=45 descr="Number of buffer load vec32 packets processed by TA"></metric>
|
||||
<metric name="TA_BUFFER_STORE_WAVEFRONTS" block=TA event=46 descr="Number of buffer store vec32 packets processed by TA"></metric>
|
||||
</gfx12>
|
||||
|
||||
<gfx1200 base="gfx12"></gfx1200>
|
||||
<gfx1201 base="gfx12"></gfx1201>
|
||||
|
||||
|
||||
@@ -569,35 +569,6 @@
|
||||
<metric name="OccupancyPercent" expr=100*SQ_WAVE_CYCLES/GRBM_GUI_ACTIVE/CU_NUM/32 descr="GPU occupancy as % of maximum."></metric>
|
||||
</gfx11_expr>
|
||||
|
||||
<gfx12_expr>
|
||||
<metric name="KERNEL_DURATION" expr=1 descr="The duration of the kernel dispatch"></metric>
|
||||
<metric name="GPU_UTIL" expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT descr="Percentage of the time that GUI is active"></metric>
|
||||
<metric name="WAVE_DEP_WAIT" expr=100*SQ_WAIT_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for anything."></metric>
|
||||
<metric name="WAVE_ISSUE_WAIT" expr=100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction issue."></metric>
|
||||
<metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
|
||||
<metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
|
||||
<metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
|
||||
<metric name="TA_BUFFER_LOAD_WAVEFRONTS_sum" expr=sum(TA_BUFFER_LOAD_WAVEFRONTS,16) descr="Number of buffer load vec32 packets processed by the TA. Sum over TA instances."></metric>
|
||||
<metric name="TA_BUFFER_STORE_WAVEFRONTS_sum" expr=sum(TA_BUFFER_STORE_WAVEFRONTS,16) descr="Number of buffer store vec32 packets processed by the TA. Sum over TA instances."></metric>
|
||||
<metric name="GL2C_HIT_sum" expr=sum(GL2C_HIT,32) descr="Number of cache hits. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_MISS_sum" expr=sum(GL2C_MISS,32) descr="Number of cache misses. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_32B_sum" expr=sum(GL2C_EA_RDREQ_32B,32) descr="Number of 32-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_64B_sum" expr=sum(GL2C_EA_RDREQ_64B,32) descr="Number of 64-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_96B_sum" expr=sum(GL2C_EA_RDREQ_96B,32) descr="Number of 96-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_128B_sum" expr=sum(GL2C_EA_RDREQ_128B,32) descr="Number of 128-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_MC_RDREQ_sum" expr=sum(GL2C_MC_RDREQ,32) descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_MC_WRREQ_sum" expr=sum(GL2C_MC_WRREQ,32) descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_WRREQ_64B_sum" expr=sum(GL2C_EA_WRREQ_64B,32) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the GL2C_EA_wrreq interface. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_WRREQ_STALL_max" expr=max(GL2C_MC_WRREQ_STALL,32) descr="Number of cycles a write request was stalled. Max over GL2C instances."></metric>
|
||||
<metric name="L2CacheHit" expr=100*sum(GL2C_HIT,32)/(sum(GL2C_HIT,32)+sum(GL2C_MISS,32)) descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
|
||||
<metric name="FETCH_SIZE" expr=(GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
|
||||
<metric name="WriteUnitStalled" expr=100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
|
||||
<metric name="LDSBankConflict" expr=100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
|
||||
</gfx12_expr>
|
||||
|
||||
<gfx1200_expr base="gfx12_expr"></gfx1200_expr>
|
||||
<gfx1201_expr base="gfx12_expr"></gfx1201_expr>
|
||||
|
||||
<gfx1100_expr base="gfx11_expr"></gfx1100_expr>
|
||||
<gfx1101_expr base="gfx11_expr"></gfx1101_expr>
|
||||
<gfx1102_expr base="gfx11_expr"></gfx1102_expr>
|
||||
@@ -628,9 +599,6 @@
|
||||
<gfx1102 base="gfx1102_expr"></gfx1102>
|
||||
<gfx1150 base="gfx1150_expr"></gfx1150>
|
||||
<gfx1151 base="gfx1151_expr"></gfx1151>
|
||||
#Navi4x
|
||||
<gfx1200 base="gfx1200_expr"></gfx1200>
|
||||
<gfx1201 base="gfx1201_expr"></gfx1201>
|
||||
|
||||
|
||||
<global>
|
||||
|
||||
@@ -230,7 +230,6 @@ class MetricsDict {
|
||||
"gfx1030", "gfx1031", "gfx1032", // Navi2x
|
||||
"gfx1100", "gfx1101", "gfx1102", // Navi3x
|
||||
"gfx1150", "gfx1151",
|
||||
"gfx1200", "gfx1201", // Navi4x
|
||||
};
|
||||
if (supported_agent_names.find(agent_name_) != supported_agent_names.end()) {
|
||||
ImportMetrics(agent_info, agent_name_);
|
||||
|
||||
@@ -121,6 +121,9 @@ DeviceInfo::DeviceInfo(uint32_t topology_id, uint32_t gpu_id) {
|
||||
minor_version_str = std::string(1, convert(minor_version));
|
||||
else
|
||||
minor_version_str = std::to_string(minor_version);
|
||||
if (major_version > 11 || (major_version == 9 && minor_version > 4))
|
||||
rocprofiler::fatal(
|
||||
"Unsupported hardware. Use rocprofv3 tool for navi4x, mi35x and later.");
|
||||
stepping = (gfx_target_version % 100);
|
||||
if (stepping > 9)
|
||||
stepping_str = std::string(1, convert(stepping));
|
||||
|
||||
@@ -213,7 +213,6 @@ class MetricsDict {
|
||||
"gfx1030", "gfx1031", "gfx1032", // Navi2x
|
||||
"gfx1100", "gfx1101", "gfx1102", // Navi3x
|
||||
"gfx1150", "gfx1151",
|
||||
"gfx1200", "gfx1201", // Navi4x
|
||||
};
|
||||
if (supported_agent_names.find(agent_name_) != supported_agent_names.end()) {
|
||||
ImportMetrics(agent_info, agent_name_);
|
||||
|
||||
@@ -745,52 +745,3 @@
|
||||
<gfx1150 base="gfx11"></gfx1150>
|
||||
<gfx1151 base="gfx11"></gfx1151>
|
||||
|
||||
<gfx12>
|
||||
<metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
|
||||
<metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
|
||||
|
||||
<metric name="GL2C_HIT" block=GL2C event=42 descr="Number of cache hits"></metric>
|
||||
<metric name="GL2C_MISS" block=GL2C event=43 descr="Number of cache misses. UC reads count as misses."></metric>
|
||||
<metric name="GL2C_MC_WRREQ" block=GL2C event=83 descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands"></metric>
|
||||
<metric name="GL2C_EA_WRREQ_64B" block=GL2C event=85 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
|
||||
<metric name="GL2C_MC_WRREQ_STALL" block=GL2C event=88 descr="Number of cycles a write request was stalled."></metric>
|
||||
<metric name="GL2C_MC_RDREQ" block=GL2C event=96 descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte)."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_32B" block=GL2C event=99 descr="Number of 32-byte GL2C/EA read requests"></metric>
|
||||
<metric name="GL2C_EA_RDREQ_64B" block=GL2C event=100 descr="Number of 64-byte GL2C/EA read requests"></metric>
|
||||
<metric name="GL2C_EA_RDREQ_96B" block=GL2C event=101 descr="Number of 96-byte GL2C/EA read requests"></metric>
|
||||
<metric name="GL2C_EA_RDREQ_128B" block=GL2C event=102 descr="Number of 128-byte GL2C/EA read requests"></metric>
|
||||
|
||||
<metric name="SQ_ACCUM_PREV" block=SQ event=1 descr="For counter N, increment by the value of counter N-1."></metric>
|
||||
<metric name="SQ_BUSY_CYCLES" block=SQ event=3 descr="Clock cycles while SQ is reporting that it is busy. {nondeterministic, global, C2}"></metric>
|
||||
<metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. {emulated, global, C1}"></metric>
|
||||
<metric name="SQ_WAVE_CYCLES" block=SQ event=24 descr="Number of clock cycles spent by waves in the SQs. Incremented by number of living (valid) waves each cycle. {nondeterministic, C1}"></metric>
|
||||
<metric name="SQ_WAIT_INST_ANY" block=SQ event=26 descr="Number of clock-cycles spent waiting for any instruction issue. In units of cycles. (nondeterministic)"></metric>
|
||||
<metric name="SQ_WAIT_ANY" block=SQ event=35 descr="Number of wave-cycles spent waiting for anything (nondeterministic, C1)"></metric>
|
||||
<metric name="SQ_INSTS_WAVE32" block=SQ event=70 descr="Number of wave32 instructions issued, for flat, lds, valu, tex. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_WAVE32_LDS" block=SQ event=72 descr="Number of wave32 LDS indexed instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_WAVE32_VALU" block=SQ event=73 descr="Number of wave32 valu instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_WAVE32_INSTS" block=SQ event=82 descr="Number of instructions issued by wave32 waves. Skipped instructions are not counted. {emulated}"></metric>
|
||||
<metric name="SQ_WAVE64_INSTS" block=SQ event=83 descr="Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated}"></metric>
|
||||
<metric name="SQ_INST_LEVEL_GDS" block=SQ event=87 descr="Number of in-flight GDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_GDS for average latency. {level, nondeterministic, C1}"></metric>
|
||||
<metric name="SQ_INST_LEVEL_LDS" block=SQ event=88 descr="Number of in-flight LDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes FLAT instructions. {level, nondeterministic, C1}"></metric>
|
||||
<metric name="SQ_INST_CYCLES_VMEM" block=SQ event=106 descr="Number of cycles needed to send addr and data for VMEM (lds, buffer, image, flat, scratch, global) instructions, windowed by perf_en. {emulated, C1}"></metric>
|
||||
<metric name="SQC_LDS_BANK_CONFLICT" block=SQ event=256 descr="Number of cycles LDS is stalled by bank conflicts. (emulated, C1)"></metric>
|
||||
<metric name="SQC_LDS_IDX_ACTIVE" block=SQ event=261 descr="Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. {per-simd, emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_VALU" block=SQ event=62 descr="Number of VALU instructions issued excluding skipped instructions. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_SALU" block=SQ event=58 descr="Number of SALU instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_SMEM" block=SQ event=59 descr="Number of SMEM instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_FLAT" block=SQ event=56 descr="Number of FLAT instructions issued. {emulated, C2}"></metric>
|
||||
<metric name="SQ_INSTS_LDS" block=SQ event=57 descr="Number of LDS indexed instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_GDS" block=SQ event=54 descr="Number of GDS instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_TEX_LOAD" block=SQ event=66 descr="Number of buffer load, image load, sample, or atomic (with return) instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_INSTS_TEX_STORE" block=SQ event=67 descr="Number of buffer store, image store, or atomic (without return) instructions issued. {emulated, C1}"></metric>
|
||||
<metric name="SQ_WAIT_INST_LDS" block=SQ event=29 descr="Number of clock cycles spent waiting for LDS (indexed) instruction issue. In units of cycles. {nondeterministic, C1}"></metric>
|
||||
<metric name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
|
||||
<metric name="TA_BUFFER_LOAD_WAVEFRONTS" block=TA event=45 descr="Number of buffer load vec32 packets processed by TA"></metric>
|
||||
<metric name="TA_BUFFER_STORE_WAVEFRONTS" block=TA event=46 descr="Number of buffer store vec32 packets processed by TA"></metric>
|
||||
|
||||
</gfx12>
|
||||
|
||||
<gfx1200 base="gfx12"></gfx1200>
|
||||
<gfx1201 base="gfx12"></gfx1201>
|
||||
|
||||
|
||||
@@ -435,36 +435,6 @@
|
||||
<metric name="OccupancyPercent" expr=100*SQ_WAVE_CYCLES/GRBM_GUI_ACTIVE/CU_NUM/32 descr="GPU occupancy as % of maximum."></metric>
|
||||
</gfx11_expr>
|
||||
|
||||
<gfx12_expr>
|
||||
<metric name="GPU_UTIL" expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT descr="Percentage of the time that GUI is active"></metric>
|
||||
<metric name="WAVE_DEP_WAIT" expr=100*SQ_WAIT_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for anything."></metric>
|
||||
<metric name="WAVE_ISSUE_WAIT" expr=100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction issue."></metric>
|
||||
|
||||
<metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
|
||||
<metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
|
||||
<metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
|
||||
<metric name="TA_BUFFER_LOAD_WAVEFRONTS_sum" expr=sum(TA_BUFFER_LOAD_WAVEFRONTS,16) descr="Number of buffer load vec32 packets processed by the TA. Sum over TA instances."></metric>
|
||||
<metric name="TA_BUFFER_STORE_WAVEFRONTS_sum" expr=sum(TA_BUFFER_STORE_WAVEFRONTS,16) descr="Number of buffer store vec32 packets processed by the TA. Sum over TA instances."></metric>
|
||||
|
||||
<metric name="GL2C_HIT_sum" expr=sum(GL2C_HIT,16) descr="Number of cache hits. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_MISS_sum" expr=sum(GL2C_MISS,16) descr="Number of cache misses. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_32B_sum" expr=sum(GL2C_EA_RDREQ_32B,16) descr="Number of 32-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_64B_sum" expr=sum(GL2C_EA_RDREQ_64B,16) descr="Number of 64-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_96B_sum" expr=sum(GL2C_EA_RDREQ_96B,16) descr="Number of 96-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_RDREQ_128B_sum" expr=sum(GL2C_EA_RDREQ_128B,16) descr="Number of 128-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_MC_RDREQ_sum" expr=sum(GL2C_MC_RDREQ,16) descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_MC_WRREQ_sum" expr=sum(GL2C_MC_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_EA_WRREQ_64B_sum" expr=sum(GL2C_EA_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the GL2C_EA_wrreq interface. Sum over GL2C instances."></metric>
|
||||
<metric name="GL2C_WRREQ_STALL_max" expr=max(GL2C_MC_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over GL2C instances."></metric>
|
||||
<metric name="L2CacheHit" expr=100*sum(GL2C_HIT,16)/(sum(GL2C_HIT,16)+sum(GL2C_MISS,16)) descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
|
||||
<metric name="FETCH_SIZE" expr=(GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
|
||||
<metric name="WriteUnitStalled" expr=100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
|
||||
<metric name="LDSBankConflict" expr=100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
|
||||
</gfx12_expr>
|
||||
|
||||
<gfx1200_expr base="gfx12_expr"></gfx1200_expr>
|
||||
<gfx1201_expr base="gfx12_expr"></gfx1201_expr>
|
||||
|
||||
<gfx1100_expr base="gfx11_expr"></gfx1100_expr>
|
||||
<gfx1101_expr base="gfx11_expr"></gfx1101_expr>
|
||||
<gfx1102_expr base="gfx11_expr"></gfx1102_expr>
|
||||
@@ -506,9 +476,6 @@
|
||||
<gfx1150 base="gfx1150_expr"></gfx1150>
|
||||
<gfx1151 base="gfx1151_expr"></gfx1151>
|
||||
|
||||
<gfx1200 base="gfx1200_expr"></gfx1200>
|
||||
<gfx1201 base="gfx1201_expr"></gfx1201>
|
||||
|
||||
<global>
|
||||
# GPUBusy The percentage of time GPU was busy.
|
||||
<metric
|
||||
|
||||
@@ -308,7 +308,7 @@ function(generate_hsaco TARGET_ID INPUT_FILE OUTPUT_FILE)
|
||||
PARENT_SCOPE)
|
||||
endfunction(generate_hsaco)
|
||||
|
||||
set(GPU_LIST "gfx900" "gfx906" "gfx908" "gfx90a" "gfx942" "gfx1030" "gfx1031" "gfx1032" "gfx1100" "gfx1101" "gfx1102" "gfx1150" "gfx1151" "gfx1200" "gfx1201")
|
||||
set(GPU_LIST "gfx900" "gfx906" "gfx908" "gfx90a" "gfx942" "gfx1030" "gfx1031" "gfx1032" "gfx1100" "gfx1101" "gfx1102" "gfx1150" "gfx1151")
|
||||
foreach(target_id ${GPU_LIST})
|
||||
# generate kernel bitcodes
|
||||
generate_hsaco(${target_id} ${CMAKE_CURRENT_SOURCE_DIR}/apps/copy.cl
|
||||
|
||||
@@ -519,8 +519,7 @@ TEST_F(LoadUnloadTest, WhenLoadingFirstTimeThenToolLoadsUnloadsSuccessfully) {
|
||||
std::string agentname;
|
||||
agentname.resize(64);
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agentname.data());
|
||||
if ((agentname.find("gfx11") != std::string::npos) ||
|
||||
(agentname.find("gfx12") != std::string::npos))
|
||||
if ((agentname.find("gfx11") != std::string::npos))
|
||||
bSkipCounterNoneZeroCheck = true;
|
||||
|
||||
hsa_device_type_t type;
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário