Add vcopy workloads for MI300A and MI300X
Signed-off-by: coleramos425 <colramos@amd.com>
[ROCm/rocprofiler-compute commit: c079f1330b]
Este commit está contenido en:
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,GRBM_COUNT,GRBM_GUI_ACTIVE,SQ_WAVES,SQ_IFETCH,SQ_IFETCH_LEVEL,SQ_ACCUM_PREV_HIRES
|
||||
0,11995,1,239059,239059,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,208371.0,208371.0,16384.0,65536.0,27933.0,2268960.0
|
||||
1,11995,1,239059,239059,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,200939.0,200939.0,16384.0,65536.0,13020.0,1048740.0
|
||||
2,11995,1,239059,239059,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,177122.0,177122.0,16384.0,65536.0,13215.0,1049244.0
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_LDS,SQ_INST_LEVEL_LDS,SQ_ACCUM_PREV_HIRES
|
||||
0,11995,1,239070,239070,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,0.0,0.0,0.0
|
||||
1,11995,1,239070,239070,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,0.0,0.0,0.0
|
||||
2,11995,1,239070,239070,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,0.0,0.0,0.0
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_SMEM,SQ_INST_LEVEL_SMEM,SQ_ACCUM_PREV_HIRES
|
||||
0,11995,1,239081,239081,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,65536.0,282426.0,22607296.0
|
||||
1,11995,1,239081,239081,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,65536.0,262012.0,20878056.0
|
||||
2,11995,1,239081,239081,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,65536.0,221574.0,17737216.0
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_VMEM,SQ_INST_LEVEL_VMEM,SQ_ACCUM_PREV_HIRES
|
||||
0,11995,1,239092,239092,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,32768.0,525409.0,42022704.0
|
||||
1,11995,1,239092,239092,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,32768.0,405541.0,32437816.0
|
||||
2,11995,1,239092,239092,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,32768.0,414783.0,33168224.0
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,GRBM_COUNT,GRBM_GUI_ACTIVE,CPC_ME1_BUSY_FOR_PACKET_DECODE,SQ_CYCLES,SQ_WAVES,SQ_WAVE_CYCLES,SQ_BUSY_CYCLES,SQ_LEVEL_WAVES,SQ_ACCUM_PREV_HIRES
|
||||
0,11995,1,239103,239103,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,220553.0,220553.0,125881.0,882212.0,16384.0,13496981.0,243456.0,0.0,54413320.0
|
||||
1,11995,1,239103,239103,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,186745.0,186745.0,104964.0,746980.0,16384.0,10050383.0,189145.0,0.0,40592236.0
|
||||
2,11995,1,239103,239103,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,177319.0,177319.0,96370.0,709276.0,16384.0,9944060.0,186126.0,0.0,40164888.0
|
||||
|
@@ -0,0 +1,251 @@
|
||||
Omniperf version: 2.0.0
|
||||
Profiler choice: rocprofv2
|
||||
Path: /home/colramos/omniperf/tests/workloads/vcopy/MI300A_A1
|
||||
Target: MI300A_A1
|
||||
Command: tests/vcopy -n 1048576 -b 256 -i 3
|
||||
Kernel Selection: None
|
||||
Dispatch Selection: None
|
||||
Hardware Blocks: All
|
||||
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Collecting Performance Counters
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_IFETCH_LEVEL.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - GRBM_COUNT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - GRBM_GUI_ACTIVE
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH_LEVEL
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|
||||
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|
||||
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_INST_LEVEL_LDS.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_LDS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_LDS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|
||||
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|
||||
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_INST_LEVEL_SMEM.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_SMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|
||||
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|
||||
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_INST_LEVEL_VMEM.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_VMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|
||||
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|
||||
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_LEVEL_WAVES.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - GRBM_COUNT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - GRBM_GUI_ACTIVE
|
||||
|-> [/opt/rocm/bin/rocprofv2] - CPC_ME1_BUSY_FOR_PACKET_DECODE
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_CYCLES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVE_CYCLES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CYCLES
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_0.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_CYCLES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CYCLES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CU_CYCLES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVE_CYCLES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_CVT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM_WR
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_1.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SALU
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VSKIPPED
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_ADD_F16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MUL_F16
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_10.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_DATA_ATOMIC_REQ
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_STALL
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_REQ
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_REQ
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_HITS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_MISSES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_MISSES_DUPLICATE
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_11.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_INPUT_VALID_READYB
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_ATOMIC
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_8
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_HITS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_MISSES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_MISSES_DUPLICATE
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_1
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_12.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_2
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_4
|
||||
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|
||||
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] sw thinks it moved 1.000000 KB per wave
|
||||
|-> [/opt/rocm/bin/rocprofv2] Total threads: 1048576, Grid Size: 4096 block Size:256, Wavefronts:16384:
|
||||
|-> [/opt/rocm/bin/rocprofv2] Launching the kernel on the GPU
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_13.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_ATOMIC[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_BUBBLE[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_CYCLE[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_ATOMIC[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_BUBBLE[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_CYCLE[1]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_14.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_LEVEL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_LEVEL[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[2]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[2]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[2]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_LEVEL[2]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[3]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[3]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_15.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[2]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[2]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[2]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[2]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[3]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[3]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[3]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[3]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[4]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[4]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[4]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[4]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[5]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[5]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[5]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[5]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_16.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_MISS[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_READ[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_REQ[0]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_17.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_TOO_MANY_EA_WRREQS_STALL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_WRITE[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_TOO_MANY_EA_WRREQS_STALL[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_WRITE[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[2]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_2.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_ADD_F32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MUL_F32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_FMA_F32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_ADD_F64
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MUL_F64
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_3.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F64
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_INT32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_INT64
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_FLAT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_LDS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_GDS
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_4.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_BRANCH
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SENDMSG
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAIT_ANY
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAIT_INST_ANY
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_ANY
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_VMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_LDS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_VALU
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_5.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_SCA
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_EXP_GDS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_MISC
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_6.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_THREAD_CYCLES_VALU
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_BANK_CONFLICT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_ADDR_CONFLICT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_UNALIGNED_STALL
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_EQ_64
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_64
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_48
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_7.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ITEMS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_MEM_VIOLATIONS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_ATOMIC_RETURN
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_8.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM_NORM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_MFMA
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_I8
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_F16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_BF16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_F32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_F64
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_9.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_I8
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_BF16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F64
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_INST_REQ
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_DATA_READ_REQ
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/timestamps.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
|
||||
[roofline] Roofline temporarily disabled in MI300
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_IFETCH SQ_IFETCH_LEVEL SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_LDS SQ_INST_LEVEL_LDS SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_SMEM SQ_INST_LEVEL_SMEM SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VMEM SQ_INST_LEVEL_VMEM SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: GRBM_COUNT GRBM_GUI_ACTIVE CPC_ME1_BUSY_FOR_PACKET_DECODE SQ_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_BUSY_CYCLES SQ_LEVEL_WAVES SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_CYCLES SQ_BUSY_CYCLES SQ_BUSY_CU_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_INSTS_VALU_CVT SQ_INSTS_VMEM_WR SQ_INSTS_VMEM_RD GRBM_COUNT GRBM_GUI_ACTIVE TCP_GATE_EN1_sum TCP_GATE_EN2_sum TCP_TD_TCP_STALL_CYCLES_sum TCP_TCR_TCP_STALL_CYCLES_sum TA_TA_BUSY_sum TA_BUFFER_WAVEFRONTS_sum TD_TD_BUSY_sum TD_TC_STALL_sum SPI_CSN_WINDOW_VALID SPI_CSN_BUSY CPC_CPC_STAT_BUSY CPC_CPC_STAT_IDLE CPF_CPF_STAT_BUSY CPF_CPF_STAT_STALL TCC_CYCLE_sum TCC_BUSY_sum TCC_PROBE_sum TCC_PROBE_ALL_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VMEM SQ_INSTS_SALU SQ_INSTS_VSKIPPED SQ_INSTS SQ_INSTS_VALU SQ_INSTS_VALU_ADD_F16 SQ_INSTS_VALU_MUL_F16 SQ_INSTS_VALU_FMA_F16 GRBM_SPI_BUSY TCP_READ_TAGCONFLICT_STALL_CYCLES_sum TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum TCP_TA_TCP_STATE_READ_sum TA_BUFFER_READ_WAVEFRONTS_sum TA_BUFFER_WRITE_WAVEFRONTS_sum TD_SPI_STALL_sum TD_LOAD_WAVEFRONT_sum SPI_CSN_NUM_THREADGROUPS SPI_CSN_WAVE CPC_CPC_TCIU_BUSY CPC_CPC_TCIU_IDLE CPF_CPF_TCIU_BUSY CPF_CPF_TCIU_STALL TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum TCC_RW_REQ_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQC_TC_DATA_ATOMIC_REQ SQC_TC_STALL SQC_TC_REQ SQC_DCACHE_REQ_READ_16 SQC_ICACHE_REQ SQC_ICACHE_HITS SQC_ICACHE_MISSES SQC_ICACHE_MISSES_DUPLICATE
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQC_DCACHE_INPUT_VALID_READYB SQC_DCACHE_ATOMIC SQC_DCACHE_REQ_READ_8 SQC_DCACHE_REQ SQC_DCACHE_HITS SQC_DCACHE_MISSES SQC_DCACHE_MISSES_DUPLICATE SQC_DCACHE_REQ_READ_1
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQC_DCACHE_REQ_READ_2 SQC_DCACHE_REQ_READ_4
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_ATOMIC[0] TCC_BUBBLE[0] TCC_CYCLE[0] TCC_EA0_ATOMIC[0] TCC_ATOMIC[1] TCC_BUBBLE[1] TCC_CYCLE[1] TCC_EA0_ATOMIC[1] TCC_ATOMIC[2] TCC_BUBBLE[2] TCC_CYCLE[2] TCC_EA0_ATOMIC[2] TCC_ATOMIC[3] TCC_BUBBLE[3] TCC_CYCLE[3] TCC_EA0_ATOMIC[3] TCC_ATOMIC[4] TCC_BUBBLE[4] TCC_CYCLE[4] TCC_EA0_ATOMIC[4] TCC_ATOMIC[5] TCC_BUBBLE[5] TCC_CYCLE[5] TCC_EA0_ATOMIC[5] TCC_ATOMIC[6] TCC_BUBBLE[6] TCC_CYCLE[6] TCC_EA0_ATOMIC[6] TCC_ATOMIC[7] TCC_BUBBLE[7] TCC_CYCLE[7] TCC_EA0_ATOMIC[7] TCC_ATOMIC[8] TCC_BUBBLE[8] TCC_CYCLE[8] TCC_EA0_ATOMIC[8] TCC_ATOMIC[9] TCC_BUBBLE[9] TCC_CYCLE[9] TCC_EA0_ATOMIC[9] TCC_ATOMIC[10] TCC_BUBBLE[10] TCC_CYCLE[10] TCC_EA0_ATOMIC[10] TCC_ATOMIC[11] TCC_BUBBLE[11] TCC_CYCLE[11] TCC_EA0_ATOMIC[11] TCC_ATOMIC[12] TCC_BUBBLE[12] TCC_CYCLE[12] TCC_EA0_ATOMIC[12] TCC_ATOMIC[13] TCC_BUBBLE[13] TCC_CYCLE[13] TCC_EA0_ATOMIC[13] TCC_ATOMIC[14] TCC_BUBBLE[14] TCC_CYCLE[14] TCC_EA0_ATOMIC[14] TCC_ATOMIC[15] TCC_BUBBLE[15] TCC_CYCLE[15] TCC_EA0_ATOMIC[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_EA0_ATOMIC_LEVEL[0] TCC_EA0_RDREQ[0] TCC_EA0_RDREQ_32B[0] TCC_EA0_RDREQ_LEVEL[0] TCC_EA0_ATOMIC_LEVEL[1] TCC_EA0_RDREQ[1] TCC_EA0_RDREQ_32B[1] TCC_EA0_RDREQ_LEVEL[1] TCC_EA0_ATOMIC_LEVEL[2] TCC_EA0_RDREQ[2] TCC_EA0_RDREQ_32B[2] TCC_EA0_RDREQ_LEVEL[2] TCC_EA0_ATOMIC_LEVEL[3] TCC_EA0_RDREQ[3] TCC_EA0_RDREQ_32B[3] TCC_EA0_RDREQ_LEVEL[3] TCC_EA0_ATOMIC_LEVEL[4] TCC_EA0_RDREQ[4] TCC_EA0_RDREQ_32B[4] TCC_EA0_RDREQ_LEVEL[4] TCC_EA0_ATOMIC_LEVEL[5] TCC_EA0_RDREQ[5] TCC_EA0_RDREQ_32B[5] TCC_EA0_RDREQ_LEVEL[5] TCC_EA0_ATOMIC_LEVEL[6] TCC_EA0_RDREQ[6] TCC_EA0_RDREQ_32B[6] TCC_EA0_RDREQ_LEVEL[6] TCC_EA0_ATOMIC_LEVEL[7] TCC_EA0_RDREQ[7] TCC_EA0_RDREQ_32B[7] TCC_EA0_RDREQ_LEVEL[7] TCC_EA0_ATOMIC_LEVEL[8] TCC_EA0_RDREQ[8] TCC_EA0_RDREQ_32B[8] TCC_EA0_RDREQ_LEVEL[8] TCC_EA0_ATOMIC_LEVEL[9] TCC_EA0_RDREQ[9] TCC_EA0_RDREQ_32B[9] TCC_EA0_RDREQ_LEVEL[9] TCC_EA0_ATOMIC_LEVEL[10] TCC_EA0_RDREQ[10] TCC_EA0_RDREQ_32B[10] TCC_EA0_RDREQ_LEVEL[10] TCC_EA0_ATOMIC_LEVEL[11] TCC_EA0_RDREQ[11] TCC_EA0_RDREQ_32B[11] TCC_EA0_RDREQ_LEVEL[11] TCC_EA0_ATOMIC_LEVEL[12] TCC_EA0_RDREQ[12] TCC_EA0_RDREQ_32B[12] TCC_EA0_RDREQ_LEVEL[12] TCC_EA0_ATOMIC_LEVEL[13] TCC_EA0_RDREQ[13] TCC_EA0_RDREQ_32B[13] TCC_EA0_RDREQ_LEVEL[13] TCC_EA0_ATOMIC_LEVEL[14] TCC_EA0_RDREQ[14] TCC_EA0_RDREQ_32B[14] TCC_EA0_RDREQ_LEVEL[14] TCC_EA0_ATOMIC_LEVEL[15] TCC_EA0_RDREQ[15] TCC_EA0_RDREQ_32B[15] TCC_EA0_RDREQ_LEVEL[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_EA0_WRREQ[0] TCC_EA0_WRREQ_64B[0] TCC_EA0_WRREQ_LEVEL[0] TCC_HIT[0] TCC_EA0_WRREQ[1] TCC_EA0_WRREQ_64B[1] TCC_EA0_WRREQ_LEVEL[1] TCC_HIT[1] TCC_EA0_WRREQ[2] TCC_EA0_WRREQ_64B[2] TCC_EA0_WRREQ_LEVEL[2] TCC_HIT[2] TCC_EA0_WRREQ[3] TCC_EA0_WRREQ_64B[3] TCC_EA0_WRREQ_LEVEL[3] TCC_HIT[3] TCC_EA0_WRREQ[4] TCC_EA0_WRREQ_64B[4] TCC_EA0_WRREQ_LEVEL[4] TCC_HIT[4] TCC_EA0_WRREQ[5] TCC_EA0_WRREQ_64B[5] TCC_EA0_WRREQ_LEVEL[5] TCC_HIT[5] TCC_EA0_WRREQ[6] TCC_EA0_WRREQ_64B[6] TCC_EA0_WRREQ_LEVEL[6] TCC_HIT[6] TCC_EA0_WRREQ[7] TCC_EA0_WRREQ_64B[7] TCC_EA0_WRREQ_LEVEL[7] TCC_HIT[7] TCC_EA0_WRREQ[8] TCC_EA0_WRREQ_64B[8] TCC_EA0_WRREQ_LEVEL[8] TCC_HIT[8] TCC_EA0_WRREQ[9] TCC_EA0_WRREQ_64B[9] TCC_EA0_WRREQ_LEVEL[9] TCC_HIT[9] TCC_EA0_WRREQ[10] TCC_EA0_WRREQ_64B[10] TCC_EA0_WRREQ_LEVEL[10] TCC_HIT[10] TCC_EA0_WRREQ[11] TCC_EA0_WRREQ_64B[11] TCC_EA0_WRREQ_LEVEL[11] TCC_HIT[11] TCC_EA0_WRREQ[12] TCC_EA0_WRREQ_64B[12] TCC_EA0_WRREQ_LEVEL[12] TCC_HIT[12] TCC_EA0_WRREQ[13] TCC_EA0_WRREQ_64B[13] TCC_EA0_WRREQ_LEVEL[13] TCC_HIT[13] TCC_EA0_WRREQ[14] TCC_EA0_WRREQ_64B[14] TCC_EA0_WRREQ_LEVEL[14] TCC_HIT[14] TCC_EA0_WRREQ[15] TCC_EA0_WRREQ_64B[15] TCC_EA0_WRREQ_LEVEL[15] TCC_HIT[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_MISS[0] TCC_READ[0] TCC_REQ[0] TCC_RW_REQ[0] TCC_MISS[1] TCC_READ[1] TCC_REQ[1] TCC_RW_REQ[1] TCC_MISS[2] TCC_READ[2] TCC_REQ[2] TCC_RW_REQ[2] TCC_MISS[3] TCC_READ[3] TCC_REQ[3] TCC_RW_REQ[3] TCC_MISS[4] TCC_READ[4] TCC_REQ[4] TCC_RW_REQ[4] TCC_MISS[5] TCC_READ[5] TCC_REQ[5] TCC_RW_REQ[5] TCC_MISS[6] TCC_READ[6] TCC_REQ[6] TCC_RW_REQ[6] TCC_MISS[7] TCC_READ[7] TCC_REQ[7] TCC_RW_REQ[7] TCC_MISS[8] TCC_READ[8] TCC_REQ[8] TCC_RW_REQ[8] TCC_MISS[9] TCC_READ[9] TCC_REQ[9] TCC_RW_REQ[9] TCC_MISS[10] TCC_READ[10] TCC_REQ[10] TCC_RW_REQ[10] TCC_MISS[11] TCC_READ[11] TCC_REQ[11] TCC_RW_REQ[11] TCC_MISS[12] TCC_READ[12] TCC_REQ[12] TCC_RW_REQ[12] TCC_MISS[13] TCC_READ[13] TCC_REQ[13] TCC_RW_REQ[13] TCC_MISS[14] TCC_READ[14] TCC_REQ[14] TCC_RW_REQ[14] TCC_MISS[15] TCC_READ[15] TCC_REQ[15] TCC_RW_REQ[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_TAG_STALL[0] TCC_TOO_MANY_EA_WRREQS_STALL[0] TCC_WRITE[0] TCC_TAG_STALL[1] TCC_TOO_MANY_EA_WRREQS_STALL[1] TCC_WRITE[1] TCC_TAG_STALL[2] TCC_TOO_MANY_EA_WRREQS_STALL[2] TCC_WRITE[2] TCC_TAG_STALL[3] TCC_TOO_MANY_EA_WRREQS_STALL[3] TCC_WRITE[3] TCC_TAG_STALL[4] TCC_TOO_MANY_EA_WRREQS_STALL[4] TCC_WRITE[4] TCC_TAG_STALL[5] TCC_TOO_MANY_EA_WRREQS_STALL[5] TCC_WRITE[5] TCC_TAG_STALL[6] TCC_TOO_MANY_EA_WRREQS_STALL[6] TCC_WRITE[6] TCC_TAG_STALL[7] TCC_TOO_MANY_EA_WRREQS_STALL[7] TCC_WRITE[7] TCC_TAG_STALL[8] TCC_TOO_MANY_EA_WRREQS_STALL[8] TCC_WRITE[8] TCC_TAG_STALL[9] TCC_TOO_MANY_EA_WRREQS_STALL[9] TCC_WRITE[9] TCC_TAG_STALL[10] TCC_TOO_MANY_EA_WRREQS_STALL[10] TCC_WRITE[10] TCC_TAG_STALL[11] TCC_TOO_MANY_EA_WRREQS_STALL[11] TCC_WRITE[11] TCC_TAG_STALL[12] TCC_TOO_MANY_EA_WRREQS_STALL[12] TCC_WRITE[12] TCC_TAG_STALL[13] TCC_TOO_MANY_EA_WRREQS_STALL[13] TCC_WRITE[13] TCC_TAG_STALL[14] TCC_TOO_MANY_EA_WRREQS_STALL[14] TCC_WRITE[14] TCC_TAG_STALL[15] TCC_TOO_MANY_EA_WRREQS_STALL[15] TCC_WRITE[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VALU_TRANS_F16 SQ_INSTS_VALU_ADD_F32 SQ_INSTS_VALU_MUL_F32 SQ_INSTS_VALU_FMA_F32 SQ_INSTS_VALU_TRANS_F32 SQ_INSTS_VALU_ADD_F64 SQ_INSTS_VALU_MUL_F64 SQ_INSTS_VALU_FMA_F64 TCP_VOLATILE_sum TCP_TOTAL_ACCESSES_sum TCP_TOTAL_READ_sum TCP_TOTAL_WRITE_sum TA_BUFFER_ATOMIC_WAVEFRONTS_sum TA_BUFFER_TOTAL_CYCLES_sum TD_ATOMIC_WAVEFRONT_sum TD_STORE_WAVEFRONT_sum SPI_RA_REQ_NO_ALLOC SPI_RA_REQ_NO_ALLOC_CSN CPC_CPC_STAT_STALL CPC_UTCL1_STALL_ON_TRANSLATION CPF_CPF_STAT_IDLE CPF_CPF_TCIU_IDLE TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VALU_TRANS_F64 SQ_INSTS_VALU_INT32 SQ_INSTS_VALU_INT64 SQ_INSTS_SMEM SQ_INSTS_FLAT SQ_INSTS_LDS SQ_INSTS_GDS SQ_INSTS_EXP_GDS TCP_TOTAL_ATOMIC_WITH_RET_sum TCP_TOTAL_ATOMIC_WITHOUT_RET_sum TCP_TOTAL_WRITEBACK_INVALIDATES_sum TCP_TOTAL_CACHE_ACCESSES_sum TA_BUFFER_COALESCED_READ_CYCLES_sum TA_BUFFER_COALESCED_WRITE_CYCLES_sum TD_COALESCABLE_WAVEFRONT_sum SPI_RA_RES_STALL_CSN SPI_RA_TMP_STALL_CSN CPC_CPC_UTCL2IU_BUSY CPC_CPC_UTCL2IU_IDLE CPF_CMP_UTCL1_STALL_ON_TRANSLATION TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_BRANCH SQ_INSTS_SENDMSG SQ_WAIT_ANY SQ_WAIT_INST_ANY SQ_ACTIVE_INST_ANY SQ_ACTIVE_INST_VMEM SQ_ACTIVE_INST_LDS SQ_ACTIVE_INST_VALU TCP_UTCL1_TRANSLATION_MISS_sum TCP_UTCL1_TRANSLATION_HIT_sum TCP_UTCL1_PERMISSION_MISS_sum TCP_UTCL1_REQUEST_sum TA_ADDR_STALLED_BY_TC_CYCLES_sum TA_TOTAL_WAVEFRONTS_sum SPI_RA_WAVE_SIMD_FULL_CSN SPI_RA_VGPR_SIMD_FULL_CSN CPC_CPC_UTCL2IU_STALL CPC_ME1_BUSY_FOR_PACKET_DECODE TCC_EA0_WRREQ_sum TCC_EA0_WRREQ_64B_sum TCC_EA0_WR_UNCACHED_32B_sum TCC_EA0_WRREQ_DRAM_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_ACTIVE_INST_SCA SQ_ACTIVE_INST_EXP_GDS SQ_ACTIVE_INST_MISC SQ_ACTIVE_INST_FLAT SQ_INST_CYCLES_VMEM_WR SQ_INST_CYCLES_VMEM_RD SQ_INST_CYCLES_SMEM SQ_INST_CYCLES_SALU TCP_TCC_READ_REQ_sum TCP_TCC_WRITE_REQ_sum TCP_TCC_ATOMIC_WITH_RET_REQ_sum TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum TA_ADDR_STALLED_BY_TD_CYCLES_sum TA_DATA_STALLED_BY_TC_CYCLES_sum SPI_RA_SGPR_SIMD_FULL_CSN SPI_RA_LDS_CU_FULL_CSN CPC_ME1_DC0_SPI_BUSY TCC_EA0_RDREQ_sum TCC_EA0_RDREQ_32B_sum TCC_BUBBLE_sum TCC_EA0_RD_UNCACHED_32B_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_THREAD_CYCLES_VALU SQ_IFETCH SQ_LDS_BANK_CONFLICT SQ_LDS_ADDR_CONFLICT SQ_LDS_UNALIGNED_STALL SQ_WAVES_EQ_64 SQ_WAVES_LT_64 SQ_WAVES_LT_48 TCP_TCC_NC_READ_REQ_sum TCP_TCC_NC_WRITE_REQ_sum TCP_TCC_NC_ATOMIC_REQ_sum TCP_TCC_UC_READ_REQ_sum TA_FLAT_WAVEFRONTS_sum TA_FLAT_READ_WAVEFRONTS_sum SPI_RA_BAR_CU_FULL_CSN SPI_RA_TGLIM_CU_FULL_CSN TCC_EA0_RDREQ_DRAM_sum TCC_TAG_STALL_sum TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_WAVES_LT_32 SQ_WAVES_LT_16 SQ_ITEMS SQ_LDS_MEM_VIOLATIONS SQ_LDS_ATOMIC_RETURN SQ_LDS_IDX_ACTIVE SQ_WAVES_RESTORED SQ_WAVES_SAVED TCP_TCC_UC_WRITE_REQ_sum TCP_TCC_UC_ATOMIC_REQ_sum TCP_TCC_CC_READ_REQ_sum TCP_TCC_CC_WRITE_REQ_sum TA_FLAT_WRITE_WAVEFRONTS_sum TA_FLAT_ATOMIC_WAVEFRONTS_sum SPI_RA_WVLIM_STALL_CSN SPI_SWC_CSC_WR TCC_NORMAL_EVICT_sum TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA0_ATOMIC_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_SMEM_NORM SQ_INSTS_MFMA SQ_INSTS_VALU_MFMA_I8 SQ_INSTS_VALU_MFMA_F16 SQ_INSTS_VALU_MFMA_BF16 SQ_INSTS_VALU_MFMA_F32 SQ_INSTS_VALU_MFMA_F64 SQ_VALU_MFMA_BUSY_CYCLES TCP_TCC_CC_ATOMIC_REQ_sum TCP_TCC_RW_READ_REQ_sum TCP_TCC_RW_WRITE_REQ_sum TCP_TCC_RW_ATOMIC_REQ_sum SPI_VWC_CSC_WR SPI_RA_BULKY_CU_FULL_CSN TCC_EA0_RDREQ_LEVEL_sum TCC_EA0_WRREQ_LEVEL_sum TCC_EA0_ATOMIC_LEVEL_sum TCC_EA0_WRREQ_STALL_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VALU_MFMA_MOPS_I8 SQ_INSTS_VALU_MFMA_MOPS_F16 SQ_INSTS_VALU_MFMA_MOPS_BF16 SQ_INSTS_VALU_MFMA_MOPS_F32 SQ_INSTS_VALU_MFMA_MOPS_F64 SQC_TC_INST_REQ SQC_TC_DATA_READ_REQ SQC_TC_DATA_WRITE_REQ TCP_PENDING_STALL_CYCLES_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc:
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
Las diferiencias del archivo han sido suprimidas porque una o mas lineas son muy largas
@@ -0,0 +1,2 @@
|
||||
workload_name,command,ip_blocks,timestamp,version,hostname,cpu_model,sbios,linux_distro,linux_kernel_version,amd_gpu_kernel_version,cpu_memory,gpu_memory,rocm_version,vbios,compute_partition,memory_partition,gpu_model,gpu_arch,gpu_l1,gpu_l2,cu_per_gpu,simd_per_cu,se_per_gpu,wave_size,workgroup_max_size,max_waves_per_cu,max_sclk,max_mclk,cur_sclk,cur_mclk,total_l2_chan,lds_banks_per_cu,sqc_per_gpu,pipes_per_gpu,hbm_bw,num_xcd
|
||||
vcopy,tests/vcopy -n 1048576 -b 256 -i 3,SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF,Thu 30 May 2024 02:09:51 PM (CDT),2,sh5-1w300-rg3-3,AMD Instinct MI300A Accelerator,"American Megatrends International, LLC.RMO1002DS",Ubuntu 22.04.2 LTS,5.18.2-mi300-build-140423-ubuntu-22.04+,,131174852,,6.1.2-110,N/A,SPX,NPS1,MI300A_A1,gfx942,32,24576,228,4,24,64,1024,32,2100,1300,2100,1300,96,32,120,4,5324.8,6
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID
|
||||
1,11995,1,239313,239313,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0
|
||||
3,11995,1,239313,239313,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0
|
||||
2,11995,1,239313,239313,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,GRBM_COUNT,GRBM_GUI_ACTIVE,SQ_WAVES,SQ_IFETCH,SQ_IFETCH_LEVEL,SQ_ACCUM_PREV_HIRES
|
||||
0,60633,1,1102641,1102641,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,449213.0,449213.0,16384.0,65536.0,40694.0,3249356.0
|
||||
1,60633,1,1102641,1102641,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,421528.0,421528.0,16384.0,65536.0,13164.0,1048588.0
|
||||
2,60633,1,1102641,1102641,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,431238.0,431238.0,16384.0,65536.0,13128.0,1048576.0
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_LDS,SQ_INST_LEVEL_LDS,SQ_ACCUM_PREV_HIRES
|
||||
0,60633,1,1102667,1102667,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,0.0,0.0,0.0
|
||||
1,60633,1,1102667,1102667,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,0.0,0.0,0.0
|
||||
2,60633,1,1102667,1102667,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,0.0,0.0,0.0
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_SMEM,SQ_INST_LEVEL_SMEM,SQ_ACCUM_PREV_HIRES
|
||||
0,60633,1,1102685,1102685,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,65536.0,3692420.0,295366648.0
|
||||
1,60633,1,1102685,1102685,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,65536.0,3741276.0,299325624.0
|
||||
2,60633,1,1102685,1102685,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,65536.0,3495126.0,279581920.0
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_VMEM,SQ_INST_LEVEL_VMEM,SQ_ACCUM_PREV_HIRES
|
||||
0,60633,1,1102722,1102722,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,32768.0,504073.0,40323912.0
|
||||
1,60633,1,1102722,1102722,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,32768.0,369531.0,29558556.0
|
||||
2,60633,1,1102722,1102722,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,32768.0,402918.0,32234036.0
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,GRBM_COUNT,GRBM_GUI_ACTIVE,CPC_ME1_BUSY_FOR_PACKET_DECODE,SQ_CYCLES,SQ_WAVES,SQ_WAVE_CYCLES,SQ_BUSY_CYCLES,SQ_LEVEL_WAVES,SQ_ACCUM_PREV_HIRES
|
||||
0,60633,1,1102733,1102733,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,486977.0,486977.0,287597.0,1947908.0,16384.0,38712781.0,643368.0,0.0,155189876.0
|
||||
1,60633,1,1102733,1102733,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,453307.0,453307.0,266894.0,1813228.0,16384.0,34185086.0,537047.0,0.0,137095332.0
|
||||
2,60633,1,1102733,1102733,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,417367.0,417367.0,227048.0,1669468.0,16384.0,35398046.0,555090.0,0.0,141949588.0
|
||||
|
@@ -0,0 +1,161 @@
|
||||
Omniperf version: 2.0.0
|
||||
Profiler choice: rocprofv2
|
||||
Path: /home/colramos/omniperf/tests/workloads/vcopy/MI300X_A1
|
||||
Target: MI300X_A1
|
||||
Command: ./tests/vcopy -n 1048576 -b 256 -i 3
|
||||
Kernel Selection: None
|
||||
Dispatch Selection: None
|
||||
Hardware Blocks: All
|
||||
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Collecting Performance Counters
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_IFETCH_LEVEL.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - GRBM_COUNT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - GRBM_GUI_ACTIVE
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH_LEVEL
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_LDS.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_LDS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_LDS
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_SMEM.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_SMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|
||||
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_VMEM.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_VMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|
||||
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_LEVEL_WAVES.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - GRBM_COUNT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - GRBM_GUI_ACTIVE
|
||||
|-> [/opt/rocm/bin/rocprofv2] - CPC_ME1_BUSY_FOR_PACKET_DECODE
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_CYCLES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_0.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_CYCLES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CYCLES
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CU_CYCLES
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_1.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SALU
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VSKIPPED
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_10.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_DATA_ATOMIC_REQ
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_STALL
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_REQ
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_REQ
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_11.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_INPUT_VALID_READYB
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_ATOMIC
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_12.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_2
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_4
|
||||
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|
||||
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_13.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_ATOMIC[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_BUBBLE[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_CYCLE[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_ATOMIC[1]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_14.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_LEVEL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[1]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[1]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_15.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[0]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_16.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_MISS[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_READ[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_REQ[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_RW_REQ[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_MISS[1]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_17.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_TOO_MANY_EA_WRREQS_STALL[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_WRITE[0]
|
||||
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[1]
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_2.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_ADD_F32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MUL_F32
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_3.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F64
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_INT32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_INT64
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_4.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_BRANCH
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SENDMSG
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAIT_ANY
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAIT_INST_ANY
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_ANY
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_5.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_SCA
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_EXP_GDS
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_MISC
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_FLAT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_CYCLES_VMEM_WR
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_CYCLES_VMEM_RD
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_CYCLES_SMEM
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_6.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_THREAD_CYCLES_VALU
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_BANK_CONFLICT
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_ADDR_CONFLICT
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_7.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_32
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_ITEMS
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_8.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM_NORM
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_MFMA
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_I8
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_F16
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_9.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_I8
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_BF16
|
||||
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F32
|
||||
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/timestamps.txt
|
||||
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|
||||
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
|
||||
[roofline] Roofline temporarily disabled in MI300
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_IFETCH SQ_IFETCH_LEVEL SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_LDS SQ_INST_LEVEL_LDS SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_SMEM SQ_INST_LEVEL_SMEM SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VMEM SQ_INST_LEVEL_VMEM SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
pmc: GRBM_COUNT GRBM_GUI_ACTIVE CPC_ME1_BUSY_FOR_PACKET_DECODE SQ_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_BUSY_CYCLES SQ_LEVEL_WAVES SQ_ACCUM_PREV_HIRES
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_CYCLES SQ_BUSY_CYCLES SQ_BUSY_CU_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_INSTS_VALU_CVT SQ_INSTS_VMEM_WR SQ_INSTS_VMEM_RD GRBM_COUNT GRBM_GUI_ACTIVE TCP_GATE_EN1_sum TCP_GATE_EN2_sum TCP_TD_TCP_STALL_CYCLES_sum TCP_TCR_TCP_STALL_CYCLES_sum TA_TA_BUSY_sum TA_BUFFER_WAVEFRONTS_sum TD_TD_BUSY_sum TD_TC_STALL_sum SPI_CSN_WINDOW_VALID SPI_CSN_BUSY CPC_CPC_STAT_BUSY CPC_CPC_STAT_IDLE CPF_CPF_STAT_BUSY CPF_CPF_STAT_STALL TCC_CYCLE_sum TCC_BUSY_sum TCC_PROBE_sum TCC_PROBE_ALL_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VMEM SQ_INSTS_SALU SQ_INSTS_VSKIPPED SQ_INSTS SQ_INSTS_VALU SQ_INSTS_VALU_ADD_F16 SQ_INSTS_VALU_MUL_F16 SQ_INSTS_VALU_FMA_F16 GRBM_SPI_BUSY TCP_READ_TAGCONFLICT_STALL_CYCLES_sum TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum TCP_TA_TCP_STATE_READ_sum TA_BUFFER_READ_WAVEFRONTS_sum TA_BUFFER_WRITE_WAVEFRONTS_sum TD_SPI_STALL_sum TD_LOAD_WAVEFRONT_sum SPI_CSN_NUM_THREADGROUPS SPI_CSN_WAVE CPC_CPC_TCIU_BUSY CPC_CPC_TCIU_IDLE CPF_CPF_TCIU_BUSY CPF_CPF_TCIU_STALL TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum TCC_RW_REQ_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQC_TC_DATA_ATOMIC_REQ SQC_TC_STALL SQC_TC_REQ SQC_DCACHE_REQ_READ_16 SQC_ICACHE_REQ SQC_ICACHE_HITS SQC_ICACHE_MISSES SQC_ICACHE_MISSES_DUPLICATE
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQC_DCACHE_INPUT_VALID_READYB SQC_DCACHE_ATOMIC SQC_DCACHE_REQ_READ_8 SQC_DCACHE_REQ SQC_DCACHE_HITS SQC_DCACHE_MISSES SQC_DCACHE_MISSES_DUPLICATE SQC_DCACHE_REQ_READ_1
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQC_DCACHE_REQ_READ_2 SQC_DCACHE_REQ_READ_4
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_ATOMIC[0] TCC_BUBBLE[0] TCC_CYCLE[0] TCC_EA0_ATOMIC[0] TCC_ATOMIC[1] TCC_BUBBLE[1] TCC_CYCLE[1] TCC_EA0_ATOMIC[1] TCC_ATOMIC[2] TCC_BUBBLE[2] TCC_CYCLE[2] TCC_EA0_ATOMIC[2] TCC_ATOMIC[3] TCC_BUBBLE[3] TCC_CYCLE[3] TCC_EA0_ATOMIC[3] TCC_ATOMIC[4] TCC_BUBBLE[4] TCC_CYCLE[4] TCC_EA0_ATOMIC[4] TCC_ATOMIC[5] TCC_BUBBLE[5] TCC_CYCLE[5] TCC_EA0_ATOMIC[5] TCC_ATOMIC[6] TCC_BUBBLE[6] TCC_CYCLE[6] TCC_EA0_ATOMIC[6] TCC_ATOMIC[7] TCC_BUBBLE[7] TCC_CYCLE[7] TCC_EA0_ATOMIC[7] TCC_ATOMIC[8] TCC_BUBBLE[8] TCC_CYCLE[8] TCC_EA0_ATOMIC[8] TCC_ATOMIC[9] TCC_BUBBLE[9] TCC_CYCLE[9] TCC_EA0_ATOMIC[9] TCC_ATOMIC[10] TCC_BUBBLE[10] TCC_CYCLE[10] TCC_EA0_ATOMIC[10] TCC_ATOMIC[11] TCC_BUBBLE[11] TCC_CYCLE[11] TCC_EA0_ATOMIC[11] TCC_ATOMIC[12] TCC_BUBBLE[12] TCC_CYCLE[12] TCC_EA0_ATOMIC[12] TCC_ATOMIC[13] TCC_BUBBLE[13] TCC_CYCLE[13] TCC_EA0_ATOMIC[13] TCC_ATOMIC[14] TCC_BUBBLE[14] TCC_CYCLE[14] TCC_EA0_ATOMIC[14] TCC_ATOMIC[15] TCC_BUBBLE[15] TCC_CYCLE[15] TCC_EA0_ATOMIC[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_EA0_ATOMIC_LEVEL[0] TCC_EA0_RDREQ[0] TCC_EA0_RDREQ_32B[0] TCC_EA0_RDREQ_LEVEL[0] TCC_EA0_ATOMIC_LEVEL[1] TCC_EA0_RDREQ[1] TCC_EA0_RDREQ_32B[1] TCC_EA0_RDREQ_LEVEL[1] TCC_EA0_ATOMIC_LEVEL[2] TCC_EA0_RDREQ[2] TCC_EA0_RDREQ_32B[2] TCC_EA0_RDREQ_LEVEL[2] TCC_EA0_ATOMIC_LEVEL[3] TCC_EA0_RDREQ[3] TCC_EA0_RDREQ_32B[3] TCC_EA0_RDREQ_LEVEL[3] TCC_EA0_ATOMIC_LEVEL[4] TCC_EA0_RDREQ[4] TCC_EA0_RDREQ_32B[4] TCC_EA0_RDREQ_LEVEL[4] TCC_EA0_ATOMIC_LEVEL[5] TCC_EA0_RDREQ[5] TCC_EA0_RDREQ_32B[5] TCC_EA0_RDREQ_LEVEL[5] TCC_EA0_ATOMIC_LEVEL[6] TCC_EA0_RDREQ[6] TCC_EA0_RDREQ_32B[6] TCC_EA0_RDREQ_LEVEL[6] TCC_EA0_ATOMIC_LEVEL[7] TCC_EA0_RDREQ[7] TCC_EA0_RDREQ_32B[7] TCC_EA0_RDREQ_LEVEL[7] TCC_EA0_ATOMIC_LEVEL[8] TCC_EA0_RDREQ[8] TCC_EA0_RDREQ_32B[8] TCC_EA0_RDREQ_LEVEL[8] TCC_EA0_ATOMIC_LEVEL[9] TCC_EA0_RDREQ[9] TCC_EA0_RDREQ_32B[9] TCC_EA0_RDREQ_LEVEL[9] TCC_EA0_ATOMIC_LEVEL[10] TCC_EA0_RDREQ[10] TCC_EA0_RDREQ_32B[10] TCC_EA0_RDREQ_LEVEL[10] TCC_EA0_ATOMIC_LEVEL[11] TCC_EA0_RDREQ[11] TCC_EA0_RDREQ_32B[11] TCC_EA0_RDREQ_LEVEL[11] TCC_EA0_ATOMIC_LEVEL[12] TCC_EA0_RDREQ[12] TCC_EA0_RDREQ_32B[12] TCC_EA0_RDREQ_LEVEL[12] TCC_EA0_ATOMIC_LEVEL[13] TCC_EA0_RDREQ[13] TCC_EA0_RDREQ_32B[13] TCC_EA0_RDREQ_LEVEL[13] TCC_EA0_ATOMIC_LEVEL[14] TCC_EA0_RDREQ[14] TCC_EA0_RDREQ_32B[14] TCC_EA0_RDREQ_LEVEL[14] TCC_EA0_ATOMIC_LEVEL[15] TCC_EA0_RDREQ[15] TCC_EA0_RDREQ_32B[15] TCC_EA0_RDREQ_LEVEL[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_EA0_WRREQ[0] TCC_EA0_WRREQ_64B[0] TCC_EA0_WRREQ_LEVEL[0] TCC_HIT[0] TCC_EA0_WRREQ[1] TCC_EA0_WRREQ_64B[1] TCC_EA0_WRREQ_LEVEL[1] TCC_HIT[1] TCC_EA0_WRREQ[2] TCC_EA0_WRREQ_64B[2] TCC_EA0_WRREQ_LEVEL[2] TCC_HIT[2] TCC_EA0_WRREQ[3] TCC_EA0_WRREQ_64B[3] TCC_EA0_WRREQ_LEVEL[3] TCC_HIT[3] TCC_EA0_WRREQ[4] TCC_EA0_WRREQ_64B[4] TCC_EA0_WRREQ_LEVEL[4] TCC_HIT[4] TCC_EA0_WRREQ[5] TCC_EA0_WRREQ_64B[5] TCC_EA0_WRREQ_LEVEL[5] TCC_HIT[5] TCC_EA0_WRREQ[6] TCC_EA0_WRREQ_64B[6] TCC_EA0_WRREQ_LEVEL[6] TCC_HIT[6] TCC_EA0_WRREQ[7] TCC_EA0_WRREQ_64B[7] TCC_EA0_WRREQ_LEVEL[7] TCC_HIT[7] TCC_EA0_WRREQ[8] TCC_EA0_WRREQ_64B[8] TCC_EA0_WRREQ_LEVEL[8] TCC_HIT[8] TCC_EA0_WRREQ[9] TCC_EA0_WRREQ_64B[9] TCC_EA0_WRREQ_LEVEL[9] TCC_HIT[9] TCC_EA0_WRREQ[10] TCC_EA0_WRREQ_64B[10] TCC_EA0_WRREQ_LEVEL[10] TCC_HIT[10] TCC_EA0_WRREQ[11] TCC_EA0_WRREQ_64B[11] TCC_EA0_WRREQ_LEVEL[11] TCC_HIT[11] TCC_EA0_WRREQ[12] TCC_EA0_WRREQ_64B[12] TCC_EA0_WRREQ_LEVEL[12] TCC_HIT[12] TCC_EA0_WRREQ[13] TCC_EA0_WRREQ_64B[13] TCC_EA0_WRREQ_LEVEL[13] TCC_HIT[13] TCC_EA0_WRREQ[14] TCC_EA0_WRREQ_64B[14] TCC_EA0_WRREQ_LEVEL[14] TCC_HIT[14] TCC_EA0_WRREQ[15] TCC_EA0_WRREQ_64B[15] TCC_EA0_WRREQ_LEVEL[15] TCC_HIT[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_MISS[0] TCC_READ[0] TCC_REQ[0] TCC_RW_REQ[0] TCC_MISS[1] TCC_READ[1] TCC_REQ[1] TCC_RW_REQ[1] TCC_MISS[2] TCC_READ[2] TCC_REQ[2] TCC_RW_REQ[2] TCC_MISS[3] TCC_READ[3] TCC_REQ[3] TCC_RW_REQ[3] TCC_MISS[4] TCC_READ[4] TCC_REQ[4] TCC_RW_REQ[4] TCC_MISS[5] TCC_READ[5] TCC_REQ[5] TCC_RW_REQ[5] TCC_MISS[6] TCC_READ[6] TCC_REQ[6] TCC_RW_REQ[6] TCC_MISS[7] TCC_READ[7] TCC_REQ[7] TCC_RW_REQ[7] TCC_MISS[8] TCC_READ[8] TCC_REQ[8] TCC_RW_REQ[8] TCC_MISS[9] TCC_READ[9] TCC_REQ[9] TCC_RW_REQ[9] TCC_MISS[10] TCC_READ[10] TCC_REQ[10] TCC_RW_REQ[10] TCC_MISS[11] TCC_READ[11] TCC_REQ[11] TCC_RW_REQ[11] TCC_MISS[12] TCC_READ[12] TCC_REQ[12] TCC_RW_REQ[12] TCC_MISS[13] TCC_READ[13] TCC_REQ[13] TCC_RW_REQ[13] TCC_MISS[14] TCC_READ[14] TCC_REQ[14] TCC_RW_REQ[14] TCC_MISS[15] TCC_READ[15] TCC_REQ[15] TCC_RW_REQ[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: TCC_TAG_STALL[0] TCC_TOO_MANY_EA_WRREQS_STALL[0] TCC_WRITE[0] TCC_TAG_STALL[1] TCC_TOO_MANY_EA_WRREQS_STALL[1] TCC_WRITE[1] TCC_TAG_STALL[2] TCC_TOO_MANY_EA_WRREQS_STALL[2] TCC_WRITE[2] TCC_TAG_STALL[3] TCC_TOO_MANY_EA_WRREQS_STALL[3] TCC_WRITE[3] TCC_TAG_STALL[4] TCC_TOO_MANY_EA_WRREQS_STALL[4] TCC_WRITE[4] TCC_TAG_STALL[5] TCC_TOO_MANY_EA_WRREQS_STALL[5] TCC_WRITE[5] TCC_TAG_STALL[6] TCC_TOO_MANY_EA_WRREQS_STALL[6] TCC_WRITE[6] TCC_TAG_STALL[7] TCC_TOO_MANY_EA_WRREQS_STALL[7] TCC_WRITE[7] TCC_TAG_STALL[8] TCC_TOO_MANY_EA_WRREQS_STALL[8] TCC_WRITE[8] TCC_TAG_STALL[9] TCC_TOO_MANY_EA_WRREQS_STALL[9] TCC_WRITE[9] TCC_TAG_STALL[10] TCC_TOO_MANY_EA_WRREQS_STALL[10] TCC_WRITE[10] TCC_TAG_STALL[11] TCC_TOO_MANY_EA_WRREQS_STALL[11] TCC_WRITE[11] TCC_TAG_STALL[12] TCC_TOO_MANY_EA_WRREQS_STALL[12] TCC_WRITE[12] TCC_TAG_STALL[13] TCC_TOO_MANY_EA_WRREQS_STALL[13] TCC_WRITE[13] TCC_TAG_STALL[14] TCC_TOO_MANY_EA_WRREQS_STALL[14] TCC_WRITE[14] TCC_TAG_STALL[15] TCC_TOO_MANY_EA_WRREQS_STALL[15] TCC_WRITE[15]
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VALU_TRANS_F16 SQ_INSTS_VALU_ADD_F32 SQ_INSTS_VALU_MUL_F32 SQ_INSTS_VALU_FMA_F32 SQ_INSTS_VALU_TRANS_F32 SQ_INSTS_VALU_ADD_F64 SQ_INSTS_VALU_MUL_F64 SQ_INSTS_VALU_FMA_F64 TCP_VOLATILE_sum TCP_TOTAL_ACCESSES_sum TCP_TOTAL_READ_sum TCP_TOTAL_WRITE_sum TA_BUFFER_ATOMIC_WAVEFRONTS_sum TA_BUFFER_TOTAL_CYCLES_sum TD_ATOMIC_WAVEFRONT_sum TD_STORE_WAVEFRONT_sum SPI_RA_REQ_NO_ALLOC SPI_RA_REQ_NO_ALLOC_CSN CPC_CPC_STAT_STALL CPC_UTCL1_STALL_ON_TRANSLATION CPF_CPF_STAT_IDLE CPF_CPF_TCIU_IDLE TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VALU_TRANS_F64 SQ_INSTS_VALU_INT32 SQ_INSTS_VALU_INT64 SQ_INSTS_SMEM SQ_INSTS_FLAT SQ_INSTS_LDS SQ_INSTS_GDS SQ_INSTS_EXP_GDS TCP_TOTAL_ATOMIC_WITH_RET_sum TCP_TOTAL_ATOMIC_WITHOUT_RET_sum TCP_TOTAL_WRITEBACK_INVALIDATES_sum TCP_TOTAL_CACHE_ACCESSES_sum TA_BUFFER_COALESCED_READ_CYCLES_sum TA_BUFFER_COALESCED_WRITE_CYCLES_sum TD_COALESCABLE_WAVEFRONT_sum SPI_RA_RES_STALL_CSN SPI_RA_TMP_STALL_CSN CPC_CPC_UTCL2IU_BUSY CPC_CPC_UTCL2IU_IDLE CPF_CMP_UTCL1_STALL_ON_TRANSLATION TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_BRANCH SQ_INSTS_SENDMSG SQ_WAIT_ANY SQ_WAIT_INST_ANY SQ_ACTIVE_INST_ANY SQ_ACTIVE_INST_VMEM SQ_ACTIVE_INST_LDS SQ_ACTIVE_INST_VALU TCP_UTCL1_TRANSLATION_MISS_sum TCP_UTCL1_TRANSLATION_HIT_sum TCP_UTCL1_PERMISSION_MISS_sum TCP_UTCL1_REQUEST_sum TA_ADDR_STALLED_BY_TC_CYCLES_sum TA_TOTAL_WAVEFRONTS_sum SPI_RA_WAVE_SIMD_FULL_CSN SPI_RA_VGPR_SIMD_FULL_CSN CPC_CPC_UTCL2IU_STALL CPC_ME1_BUSY_FOR_PACKET_DECODE TCC_EA0_WRREQ_sum TCC_EA0_WRREQ_64B_sum TCC_EA0_WR_UNCACHED_32B_sum TCC_EA0_WRREQ_DRAM_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_ACTIVE_INST_SCA SQ_ACTIVE_INST_EXP_GDS SQ_ACTIVE_INST_MISC SQ_ACTIVE_INST_FLAT SQ_INST_CYCLES_VMEM_WR SQ_INST_CYCLES_VMEM_RD SQ_INST_CYCLES_SMEM SQ_INST_CYCLES_SALU TCP_TCC_READ_REQ_sum TCP_TCC_WRITE_REQ_sum TCP_TCC_ATOMIC_WITH_RET_REQ_sum TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum TA_ADDR_STALLED_BY_TD_CYCLES_sum TA_DATA_STALLED_BY_TC_CYCLES_sum SPI_RA_SGPR_SIMD_FULL_CSN SPI_RA_LDS_CU_FULL_CSN CPC_ME1_DC0_SPI_BUSY TCC_EA0_RDREQ_sum TCC_EA0_RDREQ_32B_sum TCC_BUBBLE_sum TCC_EA0_RD_UNCACHED_32B_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_THREAD_CYCLES_VALU SQ_IFETCH SQ_LDS_BANK_CONFLICT SQ_LDS_ADDR_CONFLICT SQ_LDS_UNALIGNED_STALL SQ_WAVES_EQ_64 SQ_WAVES_LT_64 SQ_WAVES_LT_48 TCP_TCC_NC_READ_REQ_sum TCP_TCC_NC_WRITE_REQ_sum TCP_TCC_NC_ATOMIC_REQ_sum TCP_TCC_UC_READ_REQ_sum TA_FLAT_WAVEFRONTS_sum TA_FLAT_READ_WAVEFRONTS_sum SPI_RA_BAR_CU_FULL_CSN SPI_RA_TGLIM_CU_FULL_CSN TCC_EA0_RDREQ_DRAM_sum TCC_TAG_STALL_sum TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_WAVES_LT_32 SQ_WAVES_LT_16 SQ_ITEMS SQ_LDS_MEM_VIOLATIONS SQ_LDS_ATOMIC_RETURN SQ_LDS_IDX_ACTIVE SQ_WAVES_RESTORED SQ_WAVES_SAVED TCP_TCC_UC_WRITE_REQ_sum TCP_TCC_UC_ATOMIC_REQ_sum TCP_TCC_CC_READ_REQ_sum TCP_TCC_CC_WRITE_REQ_sum TA_FLAT_WRITE_WAVEFRONTS_sum TA_FLAT_ATOMIC_WAVEFRONTS_sum SPI_RA_WVLIM_STALL_CSN SPI_SWC_CSC_WR TCC_NORMAL_EVICT_sum TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA0_ATOMIC_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_SMEM_NORM SQ_INSTS_MFMA SQ_INSTS_VALU_MFMA_I8 SQ_INSTS_VALU_MFMA_F16 SQ_INSTS_VALU_MFMA_BF16 SQ_INSTS_VALU_MFMA_F32 SQ_INSTS_VALU_MFMA_F64 SQ_VALU_MFMA_BUSY_CYCLES TCP_TCC_CC_ATOMIC_REQ_sum TCP_TCC_RW_READ_REQ_sum TCP_TCC_RW_WRITE_REQ_sum TCP_TCC_RW_ATOMIC_REQ_sum SPI_VWC_CSC_WR SPI_RA_BULKY_CU_FULL_CSN TCC_EA0_RDREQ_LEVEL_sum TCC_EA0_WRREQ_LEVEL_sum TCC_EA0_ATOMIC_LEVEL_sum TCC_EA0_WRREQ_STALL_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc: SQ_INSTS_VALU_MFMA_MOPS_I8 SQ_INSTS_VALU_MFMA_MOPS_F16 SQ_INSTS_VALU_MFMA_MOPS_BF16 SQ_INSTS_VALU_MFMA_MOPS_F32 SQ_INSTS_VALU_MFMA_MOPS_F64 SQC_TC_INST_REQ SQC_TC_DATA_READ_REQ SQC_TC_DATA_WRITE_REQ TCP_PENDING_STALL_CYCLES_sum
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
@@ -0,0 +1,5 @@
|
||||
pmc:
|
||||
|
||||
gpu:
|
||||
range:
|
||||
kernel:
|
||||
Las diferiencias del archivo han sido suprimidas porque una o mas lineas son muy largas
@@ -0,0 +1,2 @@
|
||||
workload_name,command,ip_blocks,timestamp,version,hostname,cpu_model,sbios,linux_distro,linux_kernel_version,amd_gpu_kernel_version,cpu_memory,gpu_memory,rocm_version,vbios,compute_partition,memory_partition,gpu_model,gpu_arch,gpu_l1,gpu_l2,cu_per_gpu,simd_per_cu,se_per_gpu,wave_size,workgroup_max_size,max_waves_per_cu,max_sclk,max_mclk,cur_sclk,cur_mclk,total_l2_chan,lds_banks_per_cu,sqc_per_gpu,pipes_per_gpu,hbm_bw,num_xcd
|
||||
vcopy,./tests/vcopy -n 1048576 -b 256 -i 3,SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF,Thu 30 May 2024 02:19:39 PM (CDT),2,splinter-126-wr-c6,AMD Ryzen 9 7950X 16-Core Processor,"American Megatrends International, LLC.VS2683299N.FD",Ubuntu 22.04.4 LTS,5.18.2-mi300-build-140423-ubuntu-22.04+,,114656528,,6.2.0-13611,113-MI3SRIOV-001,SPX,NPS1,MI300X_A1,gfx942,32,4096,304,4,32,64,1024,32,2100,1300,2100,1300,128,32,160,4,5324.8,8
|
||||
|
@@ -0,0 +1,4 @@
|
||||
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID
|
||||
1,60633,1,1103031,1103031,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0
|
||||
2,60633,1,1103031,1103031,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0
|
||||
3,60633,1,1103031,1103031,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0
|
||||
|
Referencia en una nueva incidencia
Block a user