Add vcopy workloads for MI300A and MI300X

Signed-off-by: coleramos425 <colramos@amd.com>


[ROCm/rocprofiler-compute commit: c079f1330b]
Este commit está contenido en:
coleramos425
2024-05-30 19:25:26 +00:00
cometido por Cole Ramos
padre 34b289e66d
commit 2bc71cc0d8
Se han modificado 66 ficheros con 712 adiciones y 0 borrados
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,GRBM_COUNT,GRBM_GUI_ACTIVE,SQ_WAVES,SQ_IFETCH,SQ_IFETCH_LEVEL,SQ_ACCUM_PREV_HIRES
0,11995,1,239059,239059,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,208371.0,208371.0,16384.0,65536.0,27933.0,2268960.0
1,11995,1,239059,239059,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,200939.0,200939.0,16384.0,65536.0,13020.0,1048740.0
2,11995,1,239059,239059,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,177122.0,177122.0,16384.0,65536.0,13215.0,1049244.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_IFETCH SQ_IFETCH_LEVEL SQ_ACCUM_PREV_HIRES
2 0 11995 1 239059 239059 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504048950 162067504056482 0 208371.0 208371.0 16384.0 65536.0 27933.0 2268960.0
3 1 11995 1 239059 239059 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504097624 162067504103513 0 200939.0 200939.0 16384.0 65536.0 13020.0 1048740.0
4 2 11995 1 239059 239059 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504075511 162067504081560 0 177122.0 177122.0 16384.0 65536.0 13215.0 1049244.0
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_LDS,SQ_INST_LEVEL_LDS,SQ_ACCUM_PREV_HIRES
0,11995,1,239070,239070,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,0.0,0.0,0.0
1,11995,1,239070,239070,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,0.0,0.0,0.0
2,11995,1,239070,239070,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,0.0,0.0,0.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID SQ_INSTS_LDS SQ_INST_LEVEL_LDS SQ_ACCUM_PREV_HIRES
2 0 11995 1 239070 239070 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504048950 162067504056482 0 0.0 0.0 0.0
3 1 11995 1 239070 239070 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504097624 162067504103513 0 0.0 0.0 0.0
4 2 11995 1 239070 239070 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504075511 162067504081560 0 0.0 0.0 0.0
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_SMEM,SQ_INST_LEVEL_SMEM,SQ_ACCUM_PREV_HIRES
0,11995,1,239081,239081,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,65536.0,282426.0,22607296.0
1,11995,1,239081,239081,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,65536.0,262012.0,20878056.0
2,11995,1,239081,239081,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,65536.0,221574.0,17737216.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID SQ_INSTS_SMEM SQ_INST_LEVEL_SMEM SQ_ACCUM_PREV_HIRES
2 0 11995 1 239081 239081 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504048950 162067504056482 0 65536.0 282426.0 22607296.0
3 1 11995 1 239081 239081 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504097624 162067504103513 0 65536.0 262012.0 20878056.0
4 2 11995 1 239081 239081 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504075511 162067504081560 0 65536.0 221574.0 17737216.0
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_VMEM,SQ_INST_LEVEL_VMEM,SQ_ACCUM_PREV_HIRES
0,11995,1,239092,239092,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,32768.0,525409.0,42022704.0
1,11995,1,239092,239092,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,32768.0,405541.0,32437816.0
2,11995,1,239092,239092,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,32768.0,414783.0,33168224.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID SQ_INSTS_VMEM SQ_INST_LEVEL_VMEM SQ_ACCUM_PREV_HIRES
2 0 11995 1 239092 239092 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504048950 162067504056482 0 32768.0 525409.0 42022704.0
3 1 11995 1 239092 239092 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504097624 162067504103513 0 32768.0 405541.0 32437816.0
4 2 11995 1 239092 239092 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504075511 162067504081560 0 32768.0 414783.0 33168224.0
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,GRBM_COUNT,GRBM_GUI_ACTIVE,CPC_ME1_BUSY_FOR_PACKET_DECODE,SQ_CYCLES,SQ_WAVES,SQ_WAVE_CYCLES,SQ_BUSY_CYCLES,SQ_LEVEL_WAVES,SQ_ACCUM_PREV_HIRES
0,11995,1,239103,239103,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0,220553.0,220553.0,125881.0,882212.0,16384.0,13496981.0,243456.0,0.0,54413320.0
1,11995,1,239103,239103,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0,186745.0,186745.0,104964.0,746980.0,16384.0,10050383.0,189145.0,0.0,40592236.0
2,11995,1,239103,239103,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0,177319.0,177319.0,96370.0,709276.0,16384.0,9944060.0,186126.0,0.0,40164888.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID GRBM_COUNT GRBM_GUI_ACTIVE CPC_ME1_BUSY_FOR_PACKET_DECODE SQ_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_BUSY_CYCLES SQ_LEVEL_WAVES SQ_ACCUM_PREV_HIRES
2 0 11995 1 239103 239103 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504048950 162067504056482 0 220553.0 220553.0 125881.0 882212.0 16384.0 13496981.0 243456.0 0.0 54413320.0
3 1 11995 1 239103 239103 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504097624 162067504103513 0 186745.0 186745.0 104964.0 746980.0 16384.0 10050383.0 189145.0 0.0 40592236.0
4 2 11995 1 239103 239103 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504075511 162067504081560 0 177319.0 177319.0 96370.0 709276.0 16384.0 9944060.0 186126.0 0.0 40164888.0
@@ -0,0 +1,251 @@
Omniperf version: 2.0.0
Profiler choice: rocprofv2
Path: /home/colramos/omniperf/tests/workloads/vcopy/MI300A_A1
Target: MI300A_A1
Command: tests/vcopy -n 1048576 -b 256 -i 3
Kernel Selection: None
Dispatch Selection: None
Hardware Blocks: All
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Collecting Performance Counters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_IFETCH_LEVEL.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - GRBM_COUNT
|-> [/opt/rocm/bin/rocprofv2] - GRBM_GUI_ACTIVE
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH_LEVEL
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_INST_LEVEL_LDS.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_LDS
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_LDS
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_INST_LEVEL_SMEM.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_SMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_INST_LEVEL_VMEM.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_VMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/SQ_LEVEL_WAVES.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - GRBM_COUNT
|-> [/opt/rocm/bin/rocprofv2] - GRBM_GUI_ACTIVE
|-> [/opt/rocm/bin/rocprofv2] - CPC_ME1_BUSY_FOR_PACKET_DECODE
|-> [/opt/rocm/bin/rocprofv2] - SQ_CYCLES
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVE_CYCLES
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CYCLES
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_0.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_CYCLES
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CYCLES
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CU_CYCLES
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVE_CYCLES
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_CVT
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM_WR
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_1.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SALU
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VSKIPPED
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_ADD_F16
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MUL_F16
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_10.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_DATA_ATOMIC_REQ
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_STALL
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_REQ
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_16
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_REQ
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_HITS
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_MISSES
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_MISSES_DUPLICATE
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_11.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_INPUT_VALID_READYB
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_ATOMIC
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_8
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_HITS
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_MISSES
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_MISSES_DUPLICATE
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_1
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_12.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_2
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_4
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
|-> [/opt/rocm/bin/rocprofv2] sw thinks it moved 1.000000 KB per wave
|-> [/opt/rocm/bin/rocprofv2] Total threads: 1048576, Grid Size: 4096 block Size:256, Wavefronts:16384:
|-> [/opt/rocm/bin/rocprofv2] Launching the kernel on the GPU
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_13.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_ATOMIC[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_BUBBLE[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_CYCLE[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_ATOMIC[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_BUBBLE[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_CYCLE[1]
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_14.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_LEVEL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_LEVEL[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[2]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[2]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[2]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_LEVEL[2]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[3]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[3]
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_15.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[2]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[2]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[2]
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[2]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[3]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[3]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[3]
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[3]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[4]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[4]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[4]
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[4]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[5]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[5]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[5]
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[5]
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_16.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_MISS[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_READ[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_REQ[0]
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_17.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_TOO_MANY_EA_WRREQS_STALL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_WRITE[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_TOO_MANY_EA_WRREQS_STALL[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_WRITE[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[2]
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_2.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F16
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_ADD_F32
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MUL_F32
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_FMA_F32
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F32
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_ADD_F64
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MUL_F64
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_3.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F64
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_INT32
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_INT64
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_FLAT
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_LDS
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_GDS
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_4.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_BRANCH
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SENDMSG
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAIT_ANY
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAIT_INST_ANY
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_ANY
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_VMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_LDS
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_VALU
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_5.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_SCA
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_EXP_GDS
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_MISC
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_6.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_THREAD_CYCLES_VALU
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_BANK_CONFLICT
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_ADDR_CONFLICT
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_UNALIGNED_STALL
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_EQ_64
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_64
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_48
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_7.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_32
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_16
|-> [/opt/rocm/bin/rocprofv2] - SQ_ITEMS
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_MEM_VIOLATIONS
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_ATOMIC_RETURN
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_8.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM_NORM
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_MFMA
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_I8
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_F16
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_BF16
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_F32
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_F64
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/pmc_perf_9.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_I8
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F16
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_BF16
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F32
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F64
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_INST_REQ
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_DATA_READ_REQ
[profiling] Current input file: tests/workloads/vcopy/MI300A_A1/perfmon/timestamps.txt
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
[roofline] Roofline temporarily disabled in MI300
@@ -0,0 +1,5 @@
pmc: GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_IFETCH SQ_IFETCH_LEVEL SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_LDS SQ_INST_LEVEL_LDS SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_SMEM SQ_INST_LEVEL_SMEM SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VMEM SQ_INST_LEVEL_VMEM SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: GRBM_COUNT GRBM_GUI_ACTIVE CPC_ME1_BUSY_FOR_PACKET_DECODE SQ_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_BUSY_CYCLES SQ_LEVEL_WAVES SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_CYCLES SQ_BUSY_CYCLES SQ_BUSY_CU_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_INSTS_VALU_CVT SQ_INSTS_VMEM_WR SQ_INSTS_VMEM_RD GRBM_COUNT GRBM_GUI_ACTIVE TCP_GATE_EN1_sum TCP_GATE_EN2_sum TCP_TD_TCP_STALL_CYCLES_sum TCP_TCR_TCP_STALL_CYCLES_sum TA_TA_BUSY_sum TA_BUFFER_WAVEFRONTS_sum TD_TD_BUSY_sum TD_TC_STALL_sum SPI_CSN_WINDOW_VALID SPI_CSN_BUSY CPC_CPC_STAT_BUSY CPC_CPC_STAT_IDLE CPF_CPF_STAT_BUSY CPF_CPF_STAT_STALL TCC_CYCLE_sum TCC_BUSY_sum TCC_PROBE_sum TCC_PROBE_ALL_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VMEM SQ_INSTS_SALU SQ_INSTS_VSKIPPED SQ_INSTS SQ_INSTS_VALU SQ_INSTS_VALU_ADD_F16 SQ_INSTS_VALU_MUL_F16 SQ_INSTS_VALU_FMA_F16 GRBM_SPI_BUSY TCP_READ_TAGCONFLICT_STALL_CYCLES_sum TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum TCP_TA_TCP_STATE_READ_sum TA_BUFFER_READ_WAVEFRONTS_sum TA_BUFFER_WRITE_WAVEFRONTS_sum TD_SPI_STALL_sum TD_LOAD_WAVEFRONT_sum SPI_CSN_NUM_THREADGROUPS SPI_CSN_WAVE CPC_CPC_TCIU_BUSY CPC_CPC_TCIU_IDLE CPF_CPF_TCIU_BUSY CPF_CPF_TCIU_STALL TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum TCC_RW_REQ_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQC_TC_DATA_ATOMIC_REQ SQC_TC_STALL SQC_TC_REQ SQC_DCACHE_REQ_READ_16 SQC_ICACHE_REQ SQC_ICACHE_HITS SQC_ICACHE_MISSES SQC_ICACHE_MISSES_DUPLICATE
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQC_DCACHE_INPUT_VALID_READYB SQC_DCACHE_ATOMIC SQC_DCACHE_REQ_READ_8 SQC_DCACHE_REQ SQC_DCACHE_HITS SQC_DCACHE_MISSES SQC_DCACHE_MISSES_DUPLICATE SQC_DCACHE_REQ_READ_1
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQC_DCACHE_REQ_READ_2 SQC_DCACHE_REQ_READ_4
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_ATOMIC[0] TCC_BUBBLE[0] TCC_CYCLE[0] TCC_EA0_ATOMIC[0] TCC_ATOMIC[1] TCC_BUBBLE[1] TCC_CYCLE[1] TCC_EA0_ATOMIC[1] TCC_ATOMIC[2] TCC_BUBBLE[2] TCC_CYCLE[2] TCC_EA0_ATOMIC[2] TCC_ATOMIC[3] TCC_BUBBLE[3] TCC_CYCLE[3] TCC_EA0_ATOMIC[3] TCC_ATOMIC[4] TCC_BUBBLE[4] TCC_CYCLE[4] TCC_EA0_ATOMIC[4] TCC_ATOMIC[5] TCC_BUBBLE[5] TCC_CYCLE[5] TCC_EA0_ATOMIC[5] TCC_ATOMIC[6] TCC_BUBBLE[6] TCC_CYCLE[6] TCC_EA0_ATOMIC[6] TCC_ATOMIC[7] TCC_BUBBLE[7] TCC_CYCLE[7] TCC_EA0_ATOMIC[7] TCC_ATOMIC[8] TCC_BUBBLE[8] TCC_CYCLE[8] TCC_EA0_ATOMIC[8] TCC_ATOMIC[9] TCC_BUBBLE[9] TCC_CYCLE[9] TCC_EA0_ATOMIC[9] TCC_ATOMIC[10] TCC_BUBBLE[10] TCC_CYCLE[10] TCC_EA0_ATOMIC[10] TCC_ATOMIC[11] TCC_BUBBLE[11] TCC_CYCLE[11] TCC_EA0_ATOMIC[11] TCC_ATOMIC[12] TCC_BUBBLE[12] TCC_CYCLE[12] TCC_EA0_ATOMIC[12] TCC_ATOMIC[13] TCC_BUBBLE[13] TCC_CYCLE[13] TCC_EA0_ATOMIC[13] TCC_ATOMIC[14] TCC_BUBBLE[14] TCC_CYCLE[14] TCC_EA0_ATOMIC[14] TCC_ATOMIC[15] TCC_BUBBLE[15] TCC_CYCLE[15] TCC_EA0_ATOMIC[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_EA0_ATOMIC_LEVEL[0] TCC_EA0_RDREQ[0] TCC_EA0_RDREQ_32B[0] TCC_EA0_RDREQ_LEVEL[0] TCC_EA0_ATOMIC_LEVEL[1] TCC_EA0_RDREQ[1] TCC_EA0_RDREQ_32B[1] TCC_EA0_RDREQ_LEVEL[1] TCC_EA0_ATOMIC_LEVEL[2] TCC_EA0_RDREQ[2] TCC_EA0_RDREQ_32B[2] TCC_EA0_RDREQ_LEVEL[2] TCC_EA0_ATOMIC_LEVEL[3] TCC_EA0_RDREQ[3] TCC_EA0_RDREQ_32B[3] TCC_EA0_RDREQ_LEVEL[3] TCC_EA0_ATOMIC_LEVEL[4] TCC_EA0_RDREQ[4] TCC_EA0_RDREQ_32B[4] TCC_EA0_RDREQ_LEVEL[4] TCC_EA0_ATOMIC_LEVEL[5] TCC_EA0_RDREQ[5] TCC_EA0_RDREQ_32B[5] TCC_EA0_RDREQ_LEVEL[5] TCC_EA0_ATOMIC_LEVEL[6] TCC_EA0_RDREQ[6] TCC_EA0_RDREQ_32B[6] TCC_EA0_RDREQ_LEVEL[6] TCC_EA0_ATOMIC_LEVEL[7] TCC_EA0_RDREQ[7] TCC_EA0_RDREQ_32B[7] TCC_EA0_RDREQ_LEVEL[7] TCC_EA0_ATOMIC_LEVEL[8] TCC_EA0_RDREQ[8] TCC_EA0_RDREQ_32B[8] TCC_EA0_RDREQ_LEVEL[8] TCC_EA0_ATOMIC_LEVEL[9] TCC_EA0_RDREQ[9] TCC_EA0_RDREQ_32B[9] TCC_EA0_RDREQ_LEVEL[9] TCC_EA0_ATOMIC_LEVEL[10] TCC_EA0_RDREQ[10] TCC_EA0_RDREQ_32B[10] TCC_EA0_RDREQ_LEVEL[10] TCC_EA0_ATOMIC_LEVEL[11] TCC_EA0_RDREQ[11] TCC_EA0_RDREQ_32B[11] TCC_EA0_RDREQ_LEVEL[11] TCC_EA0_ATOMIC_LEVEL[12] TCC_EA0_RDREQ[12] TCC_EA0_RDREQ_32B[12] TCC_EA0_RDREQ_LEVEL[12] TCC_EA0_ATOMIC_LEVEL[13] TCC_EA0_RDREQ[13] TCC_EA0_RDREQ_32B[13] TCC_EA0_RDREQ_LEVEL[13] TCC_EA0_ATOMIC_LEVEL[14] TCC_EA0_RDREQ[14] TCC_EA0_RDREQ_32B[14] TCC_EA0_RDREQ_LEVEL[14] TCC_EA0_ATOMIC_LEVEL[15] TCC_EA0_RDREQ[15] TCC_EA0_RDREQ_32B[15] TCC_EA0_RDREQ_LEVEL[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_EA0_WRREQ[0] TCC_EA0_WRREQ_64B[0] TCC_EA0_WRREQ_LEVEL[0] TCC_HIT[0] TCC_EA0_WRREQ[1] TCC_EA0_WRREQ_64B[1] TCC_EA0_WRREQ_LEVEL[1] TCC_HIT[1] TCC_EA0_WRREQ[2] TCC_EA0_WRREQ_64B[2] TCC_EA0_WRREQ_LEVEL[2] TCC_HIT[2] TCC_EA0_WRREQ[3] TCC_EA0_WRREQ_64B[3] TCC_EA0_WRREQ_LEVEL[3] TCC_HIT[3] TCC_EA0_WRREQ[4] TCC_EA0_WRREQ_64B[4] TCC_EA0_WRREQ_LEVEL[4] TCC_HIT[4] TCC_EA0_WRREQ[5] TCC_EA0_WRREQ_64B[5] TCC_EA0_WRREQ_LEVEL[5] TCC_HIT[5] TCC_EA0_WRREQ[6] TCC_EA0_WRREQ_64B[6] TCC_EA0_WRREQ_LEVEL[6] TCC_HIT[6] TCC_EA0_WRREQ[7] TCC_EA0_WRREQ_64B[7] TCC_EA0_WRREQ_LEVEL[7] TCC_HIT[7] TCC_EA0_WRREQ[8] TCC_EA0_WRREQ_64B[8] TCC_EA0_WRREQ_LEVEL[8] TCC_HIT[8] TCC_EA0_WRREQ[9] TCC_EA0_WRREQ_64B[9] TCC_EA0_WRREQ_LEVEL[9] TCC_HIT[9] TCC_EA0_WRREQ[10] TCC_EA0_WRREQ_64B[10] TCC_EA0_WRREQ_LEVEL[10] TCC_HIT[10] TCC_EA0_WRREQ[11] TCC_EA0_WRREQ_64B[11] TCC_EA0_WRREQ_LEVEL[11] TCC_HIT[11] TCC_EA0_WRREQ[12] TCC_EA0_WRREQ_64B[12] TCC_EA0_WRREQ_LEVEL[12] TCC_HIT[12] TCC_EA0_WRREQ[13] TCC_EA0_WRREQ_64B[13] TCC_EA0_WRREQ_LEVEL[13] TCC_HIT[13] TCC_EA0_WRREQ[14] TCC_EA0_WRREQ_64B[14] TCC_EA0_WRREQ_LEVEL[14] TCC_HIT[14] TCC_EA0_WRREQ[15] TCC_EA0_WRREQ_64B[15] TCC_EA0_WRREQ_LEVEL[15] TCC_HIT[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_MISS[0] TCC_READ[0] TCC_REQ[0] TCC_RW_REQ[0] TCC_MISS[1] TCC_READ[1] TCC_REQ[1] TCC_RW_REQ[1] TCC_MISS[2] TCC_READ[2] TCC_REQ[2] TCC_RW_REQ[2] TCC_MISS[3] TCC_READ[3] TCC_REQ[3] TCC_RW_REQ[3] TCC_MISS[4] TCC_READ[4] TCC_REQ[4] TCC_RW_REQ[4] TCC_MISS[5] TCC_READ[5] TCC_REQ[5] TCC_RW_REQ[5] TCC_MISS[6] TCC_READ[6] TCC_REQ[6] TCC_RW_REQ[6] TCC_MISS[7] TCC_READ[7] TCC_REQ[7] TCC_RW_REQ[7] TCC_MISS[8] TCC_READ[8] TCC_REQ[8] TCC_RW_REQ[8] TCC_MISS[9] TCC_READ[9] TCC_REQ[9] TCC_RW_REQ[9] TCC_MISS[10] TCC_READ[10] TCC_REQ[10] TCC_RW_REQ[10] TCC_MISS[11] TCC_READ[11] TCC_REQ[11] TCC_RW_REQ[11] TCC_MISS[12] TCC_READ[12] TCC_REQ[12] TCC_RW_REQ[12] TCC_MISS[13] TCC_READ[13] TCC_REQ[13] TCC_RW_REQ[13] TCC_MISS[14] TCC_READ[14] TCC_REQ[14] TCC_RW_REQ[14] TCC_MISS[15] TCC_READ[15] TCC_REQ[15] TCC_RW_REQ[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_TAG_STALL[0] TCC_TOO_MANY_EA_WRREQS_STALL[0] TCC_WRITE[0] TCC_TAG_STALL[1] TCC_TOO_MANY_EA_WRREQS_STALL[1] TCC_WRITE[1] TCC_TAG_STALL[2] TCC_TOO_MANY_EA_WRREQS_STALL[2] TCC_WRITE[2] TCC_TAG_STALL[3] TCC_TOO_MANY_EA_WRREQS_STALL[3] TCC_WRITE[3] TCC_TAG_STALL[4] TCC_TOO_MANY_EA_WRREQS_STALL[4] TCC_WRITE[4] TCC_TAG_STALL[5] TCC_TOO_MANY_EA_WRREQS_STALL[5] TCC_WRITE[5] TCC_TAG_STALL[6] TCC_TOO_MANY_EA_WRREQS_STALL[6] TCC_WRITE[6] TCC_TAG_STALL[7] TCC_TOO_MANY_EA_WRREQS_STALL[7] TCC_WRITE[7] TCC_TAG_STALL[8] TCC_TOO_MANY_EA_WRREQS_STALL[8] TCC_WRITE[8] TCC_TAG_STALL[9] TCC_TOO_MANY_EA_WRREQS_STALL[9] TCC_WRITE[9] TCC_TAG_STALL[10] TCC_TOO_MANY_EA_WRREQS_STALL[10] TCC_WRITE[10] TCC_TAG_STALL[11] TCC_TOO_MANY_EA_WRREQS_STALL[11] TCC_WRITE[11] TCC_TAG_STALL[12] TCC_TOO_MANY_EA_WRREQS_STALL[12] TCC_WRITE[12] TCC_TAG_STALL[13] TCC_TOO_MANY_EA_WRREQS_STALL[13] TCC_WRITE[13] TCC_TAG_STALL[14] TCC_TOO_MANY_EA_WRREQS_STALL[14] TCC_WRITE[14] TCC_TAG_STALL[15] TCC_TOO_MANY_EA_WRREQS_STALL[15] TCC_WRITE[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VALU_TRANS_F16 SQ_INSTS_VALU_ADD_F32 SQ_INSTS_VALU_MUL_F32 SQ_INSTS_VALU_FMA_F32 SQ_INSTS_VALU_TRANS_F32 SQ_INSTS_VALU_ADD_F64 SQ_INSTS_VALU_MUL_F64 SQ_INSTS_VALU_FMA_F64 TCP_VOLATILE_sum TCP_TOTAL_ACCESSES_sum TCP_TOTAL_READ_sum TCP_TOTAL_WRITE_sum TA_BUFFER_ATOMIC_WAVEFRONTS_sum TA_BUFFER_TOTAL_CYCLES_sum TD_ATOMIC_WAVEFRONT_sum TD_STORE_WAVEFRONT_sum SPI_RA_REQ_NO_ALLOC SPI_RA_REQ_NO_ALLOC_CSN CPC_CPC_STAT_STALL CPC_UTCL1_STALL_ON_TRANSLATION CPF_CPF_STAT_IDLE CPF_CPF_TCIU_IDLE TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VALU_TRANS_F64 SQ_INSTS_VALU_INT32 SQ_INSTS_VALU_INT64 SQ_INSTS_SMEM SQ_INSTS_FLAT SQ_INSTS_LDS SQ_INSTS_GDS SQ_INSTS_EXP_GDS TCP_TOTAL_ATOMIC_WITH_RET_sum TCP_TOTAL_ATOMIC_WITHOUT_RET_sum TCP_TOTAL_WRITEBACK_INVALIDATES_sum TCP_TOTAL_CACHE_ACCESSES_sum TA_BUFFER_COALESCED_READ_CYCLES_sum TA_BUFFER_COALESCED_WRITE_CYCLES_sum TD_COALESCABLE_WAVEFRONT_sum SPI_RA_RES_STALL_CSN SPI_RA_TMP_STALL_CSN CPC_CPC_UTCL2IU_BUSY CPC_CPC_UTCL2IU_IDLE CPF_CMP_UTCL1_STALL_ON_TRANSLATION TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_BRANCH SQ_INSTS_SENDMSG SQ_WAIT_ANY SQ_WAIT_INST_ANY SQ_ACTIVE_INST_ANY SQ_ACTIVE_INST_VMEM SQ_ACTIVE_INST_LDS SQ_ACTIVE_INST_VALU TCP_UTCL1_TRANSLATION_MISS_sum TCP_UTCL1_TRANSLATION_HIT_sum TCP_UTCL1_PERMISSION_MISS_sum TCP_UTCL1_REQUEST_sum TA_ADDR_STALLED_BY_TC_CYCLES_sum TA_TOTAL_WAVEFRONTS_sum SPI_RA_WAVE_SIMD_FULL_CSN SPI_RA_VGPR_SIMD_FULL_CSN CPC_CPC_UTCL2IU_STALL CPC_ME1_BUSY_FOR_PACKET_DECODE TCC_EA0_WRREQ_sum TCC_EA0_WRREQ_64B_sum TCC_EA0_WR_UNCACHED_32B_sum TCC_EA0_WRREQ_DRAM_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_ACTIVE_INST_SCA SQ_ACTIVE_INST_EXP_GDS SQ_ACTIVE_INST_MISC SQ_ACTIVE_INST_FLAT SQ_INST_CYCLES_VMEM_WR SQ_INST_CYCLES_VMEM_RD SQ_INST_CYCLES_SMEM SQ_INST_CYCLES_SALU TCP_TCC_READ_REQ_sum TCP_TCC_WRITE_REQ_sum TCP_TCC_ATOMIC_WITH_RET_REQ_sum TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum TA_ADDR_STALLED_BY_TD_CYCLES_sum TA_DATA_STALLED_BY_TC_CYCLES_sum SPI_RA_SGPR_SIMD_FULL_CSN SPI_RA_LDS_CU_FULL_CSN CPC_ME1_DC0_SPI_BUSY TCC_EA0_RDREQ_sum TCC_EA0_RDREQ_32B_sum TCC_BUBBLE_sum TCC_EA0_RD_UNCACHED_32B_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_THREAD_CYCLES_VALU SQ_IFETCH SQ_LDS_BANK_CONFLICT SQ_LDS_ADDR_CONFLICT SQ_LDS_UNALIGNED_STALL SQ_WAVES_EQ_64 SQ_WAVES_LT_64 SQ_WAVES_LT_48 TCP_TCC_NC_READ_REQ_sum TCP_TCC_NC_WRITE_REQ_sum TCP_TCC_NC_ATOMIC_REQ_sum TCP_TCC_UC_READ_REQ_sum TA_FLAT_WAVEFRONTS_sum TA_FLAT_READ_WAVEFRONTS_sum SPI_RA_BAR_CU_FULL_CSN SPI_RA_TGLIM_CU_FULL_CSN TCC_EA0_RDREQ_DRAM_sum TCC_TAG_STALL_sum TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_WAVES_LT_32 SQ_WAVES_LT_16 SQ_ITEMS SQ_LDS_MEM_VIOLATIONS SQ_LDS_ATOMIC_RETURN SQ_LDS_IDX_ACTIVE SQ_WAVES_RESTORED SQ_WAVES_SAVED TCP_TCC_UC_WRITE_REQ_sum TCP_TCC_UC_ATOMIC_REQ_sum TCP_TCC_CC_READ_REQ_sum TCP_TCC_CC_WRITE_REQ_sum TA_FLAT_WRITE_WAVEFRONTS_sum TA_FLAT_ATOMIC_WAVEFRONTS_sum SPI_RA_WVLIM_STALL_CSN SPI_SWC_CSC_WR TCC_NORMAL_EVICT_sum TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA0_ATOMIC_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_SMEM_NORM SQ_INSTS_MFMA SQ_INSTS_VALU_MFMA_I8 SQ_INSTS_VALU_MFMA_F16 SQ_INSTS_VALU_MFMA_BF16 SQ_INSTS_VALU_MFMA_F32 SQ_INSTS_VALU_MFMA_F64 SQ_VALU_MFMA_BUSY_CYCLES TCP_TCC_CC_ATOMIC_REQ_sum TCP_TCC_RW_READ_REQ_sum TCP_TCC_RW_WRITE_REQ_sum TCP_TCC_RW_ATOMIC_REQ_sum SPI_VWC_CSC_WR SPI_RA_BULKY_CU_FULL_CSN TCC_EA0_RDREQ_LEVEL_sum TCC_EA0_WRREQ_LEVEL_sum TCC_EA0_ATOMIC_LEVEL_sum TCC_EA0_WRREQ_STALL_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VALU_MFMA_MOPS_I8 SQ_INSTS_VALU_MFMA_MOPS_F16 SQ_INSTS_VALU_MFMA_MOPS_BF16 SQ_INSTS_VALU_MFMA_MOPS_F32 SQ_INSTS_VALU_MFMA_MOPS_F64 SQC_TC_INST_REQ SQC_TC_DATA_READ_REQ SQC_TC_DATA_WRITE_REQ TCP_PENDING_STALL_CYCLES_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc:
gpu:
range:
kernel:
Las diferiencias del archivo han sido suprimidas porque una o mas lineas son muy largas
@@ -0,0 +1,2 @@
workload_name,command,ip_blocks,timestamp,version,hostname,cpu_model,sbios,linux_distro,linux_kernel_version,amd_gpu_kernel_version,cpu_memory,gpu_memory,rocm_version,vbios,compute_partition,memory_partition,gpu_model,gpu_arch,gpu_l1,gpu_l2,cu_per_gpu,simd_per_cu,se_per_gpu,wave_size,workgroup_max_size,max_waves_per_cu,max_sclk,max_mclk,cur_sclk,cur_mclk,total_l2_chan,lds_banks_per_cu,sqc_per_gpu,pipes_per_gpu,hbm_bw,num_xcd
vcopy,tests/vcopy -n 1048576 -b 256 -i 3,SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF,Thu 30 May 2024 02:09:51 PM (CDT),2,sh5-1w300-rg3-3,AMD Instinct MI300A Accelerator,"American Megatrends International, LLC.RMO1002DS",Ubuntu 22.04.2 LTS,5.18.2-mi300-build-140423-ubuntu-22.04+,,131174852,,6.1.2-110,N/A,SPX,NPS1,MI300A_A1,gfx942,32,24576,228,4,24,64,1024,32,2100,1300,2100,1300,96,32,120,4,5324.8,6
1 workload_name command ip_blocks timestamp version hostname cpu_model sbios linux_distro linux_kernel_version amd_gpu_kernel_version cpu_memory gpu_memory rocm_version vbios compute_partition memory_partition gpu_model gpu_arch gpu_l1 gpu_l2 cu_per_gpu simd_per_cu se_per_gpu wave_size workgroup_max_size max_waves_per_cu max_sclk max_mclk cur_sclk cur_mclk total_l2_chan lds_banks_per_cu sqc_per_gpu pipes_per_gpu hbm_bw num_xcd
2 vcopy tests/vcopy -n 1048576 -b 256 -i 3 SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF Thu 30 May 2024 02:09:51 PM (CDT) 2 sh5-1w300-rg3-3 AMD Instinct MI300A Accelerator American Megatrends International, LLC.RMO1002DS Ubuntu 22.04.2 LTS 5.18.2-mi300-build-140423-ubuntu-22.04+ 131174852 6.1.2-110 N/A SPX NPS1 MI300A_A1 gfx942 32 24576 228 4 24 64 1024 32 2100 1300 2100 1300 96 32 120 4 5324.8 6
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID
1,11995,1,239313,239313,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504048950,162067504056482,0
3,11995,1,239313,239313,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504097624,162067504103513,0
2,11995,1,239313,239313,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",162067504075511,162067504081560,0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID
2 1 11995 1 239313 239313 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504048950 162067504056482 0
3 3 11995 1 239313 239313 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504097624 162067504103513 0
4 2 11995 1 239313 239313 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 162067504075511 162067504081560 0
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,GRBM_COUNT,GRBM_GUI_ACTIVE,SQ_WAVES,SQ_IFETCH,SQ_IFETCH_LEVEL,SQ_ACCUM_PREV_HIRES
0,60633,1,1102641,1102641,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,449213.0,449213.0,16384.0,65536.0,40694.0,3249356.0
1,60633,1,1102641,1102641,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,421528.0,421528.0,16384.0,65536.0,13164.0,1048588.0
2,60633,1,1102641,1102641,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,431238.0,431238.0,16384.0,65536.0,13128.0,1048576.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_IFETCH SQ_IFETCH_LEVEL SQ_ACCUM_PREV_HIRES
2 0 60633 1 1102641 1102641 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676802805 810969676816525 0 449213.0 449213.0 16384.0 65536.0 40694.0 3249356.0
3 1 60633 1 1102641 1102641 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676837363 810969676849123 0 421528.0 421528.0 16384.0 65536.0 13164.0 1048588.0
4 2 60633 1 1102641 1102641 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676868522 810969676880402 0 431238.0 431238.0 16384.0 65536.0 13128.0 1048576.0
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_LDS,SQ_INST_LEVEL_LDS,SQ_ACCUM_PREV_HIRES
0,60633,1,1102667,1102667,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,0.0,0.0,0.0
1,60633,1,1102667,1102667,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,0.0,0.0,0.0
2,60633,1,1102667,1102667,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,0.0,0.0,0.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID SQ_INSTS_LDS SQ_INST_LEVEL_LDS SQ_ACCUM_PREV_HIRES
2 0 60633 1 1102667 1102667 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676802805 810969676816525 0 0.0 0.0 0.0
3 1 60633 1 1102667 1102667 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676837363 810969676849123 0 0.0 0.0 0.0
4 2 60633 1 1102667 1102667 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676868522 810969676880402 0 0.0 0.0 0.0
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_SMEM,SQ_INST_LEVEL_SMEM,SQ_ACCUM_PREV_HIRES
0,60633,1,1102685,1102685,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,65536.0,3692420.0,295366648.0
1,60633,1,1102685,1102685,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,65536.0,3741276.0,299325624.0
2,60633,1,1102685,1102685,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,65536.0,3495126.0,279581920.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID SQ_INSTS_SMEM SQ_INST_LEVEL_SMEM SQ_ACCUM_PREV_HIRES
2 0 60633 1 1102685 1102685 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676802805 810969676816525 0 65536.0 3692420.0 295366648.0
3 1 60633 1 1102685 1102685 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676837363 810969676849123 0 65536.0 3741276.0 299325624.0
4 2 60633 1 1102685 1102685 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676868522 810969676880402 0 65536.0 3495126.0 279581920.0
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,SQ_INSTS_VMEM,SQ_INST_LEVEL_VMEM,SQ_ACCUM_PREV_HIRES
0,60633,1,1102722,1102722,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,32768.0,504073.0,40323912.0
1,60633,1,1102722,1102722,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,32768.0,369531.0,29558556.0
2,60633,1,1102722,1102722,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,32768.0,402918.0,32234036.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID SQ_INSTS_VMEM SQ_INST_LEVEL_VMEM SQ_ACCUM_PREV_HIRES
2 0 60633 1 1102722 1102722 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676802805 810969676816525 0 32768.0 504073.0 40323912.0
3 1 60633 1 1102722 1102722 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676837363 810969676849123 0 32768.0 369531.0 29558556.0
4 2 60633 1 1102722 1102722 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676868522 810969676880402 0 32768.0 402918.0 32234036.0
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID,GRBM_COUNT,GRBM_GUI_ACTIVE,CPC_ME1_BUSY_FOR_PACKET_DECODE,SQ_CYCLES,SQ_WAVES,SQ_WAVE_CYCLES,SQ_BUSY_CYCLES,SQ_LEVEL_WAVES,SQ_ACCUM_PREV_HIRES
0,60633,1,1102733,1102733,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0,486977.0,486977.0,287597.0,1947908.0,16384.0,38712781.0,643368.0,0.0,155189876.0
1,60633,1,1102733,1102733,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0,453307.0,453307.0,266894.0,1813228.0,16384.0,34185086.0,537047.0,0.0,137095332.0
2,60633,1,1102733,1102733,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0,417367.0,417367.0,227048.0,1669468.0,16384.0,35398046.0,555090.0,0.0,141949588.0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID GRBM_COUNT GRBM_GUI_ACTIVE CPC_ME1_BUSY_FOR_PACKET_DECODE SQ_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_BUSY_CYCLES SQ_LEVEL_WAVES SQ_ACCUM_PREV_HIRES
2 0 60633 1 1102733 1102733 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676802805 810969676816525 0 486977.0 486977.0 287597.0 1947908.0 16384.0 38712781.0 643368.0 0.0 155189876.0
3 1 60633 1 1102733 1102733 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676837363 810969676849123 0 453307.0 453307.0 266894.0 1813228.0 16384.0 34185086.0 537047.0 0.0 137095332.0
4 2 60633 1 1102733 1102733 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676868522 810969676880402 0 417367.0 417367.0 227048.0 1669468.0 16384.0 35398046.0 555090.0 0.0 141949588.0
@@ -0,0 +1,161 @@
Omniperf version: 2.0.0
Profiler choice: rocprofv2
Path: /home/colramos/omniperf/tests/workloads/vcopy/MI300X_A1
Target: MI300X_A1
Command: ./tests/vcopy -n 1048576 -b 256 -i 3
Kernel Selection: None
Dispatch Selection: None
Hardware Blocks: All
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Collecting Performance Counters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_IFETCH_LEVEL.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - GRBM_COUNT
|-> [/opt/rocm/bin/rocprofv2] - GRBM_GUI_ACTIVE
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH_LEVEL
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_LDS.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_LDS
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_LDS
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_SMEM.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_SMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_VMEM.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_LEVEL_VMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACCUM_PREV_HIRES
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/SQ_LEVEL_WAVES.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - GRBM_COUNT
|-> [/opt/rocm/bin/rocprofv2] - GRBM_GUI_ACTIVE
|-> [/opt/rocm/bin/rocprofv2] - CPC_ME1_BUSY_FOR_PACKET_DECODE
|-> [/opt/rocm/bin/rocprofv2] - SQ_CYCLES
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_0.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_CYCLES
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CYCLES
|-> [/opt/rocm/bin/rocprofv2] - SQ_BUSY_CU_CYCLES
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_1.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VMEM
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SALU
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VSKIPPED
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_10.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_DATA_ATOMIC_REQ
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_STALL
|-> [/opt/rocm/bin/rocprofv2] - SQC_TC_REQ
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_16
|-> [/opt/rocm/bin/rocprofv2] - SQC_ICACHE_REQ
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_11.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_INPUT_VALID_READYB
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_ATOMIC
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_12.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_2
|-> [/opt/rocm/bin/rocprofv2] - SQC_DCACHE_REQ_READ_4
|-> [/opt/rocm/bin/rocprofv2] Enabling Counter Collection
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_13.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_ATOMIC[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_BUBBLE[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_CYCLE[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_ATOMIC[1]
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_14.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_LEVEL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_ATOMIC_LEVEL[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ[1]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_RDREQ_32B[1]
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_15.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_64B[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_EA0_WRREQ_LEVEL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_HIT[0]
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_16.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_MISS[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_READ[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_REQ[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_RW_REQ[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_MISS[1]
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_17.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_TOO_MANY_EA_WRREQS_STALL[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_WRITE[0]
|-> [/opt/rocm/bin/rocprofv2] - TCC_TAG_STALL[1]
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_2.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F16
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_ADD_F32
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MUL_F32
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_3.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_TRANS_F64
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_INT32
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_INT64
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_4.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_BRANCH
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SENDMSG
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAIT_ANY
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAIT_INST_ANY
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_ANY
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_5.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_SCA
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_EXP_GDS
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_MISC
|-> [/opt/rocm/bin/rocprofv2] - SQ_ACTIVE_INST_FLAT
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_CYCLES_VMEM_WR
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_CYCLES_VMEM_RD
|-> [/opt/rocm/bin/rocprofv2] - SQ_INST_CYCLES_SMEM
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_6.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_THREAD_CYCLES_VALU
|-> [/opt/rocm/bin/rocprofv2] - SQ_IFETCH
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_BANK_CONFLICT
|-> [/opt/rocm/bin/rocprofv2] - SQ_LDS_ADDR_CONFLICT
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_7.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_32
|-> [/opt/rocm/bin/rocprofv2] - SQ_WAVES_LT_16
|-> [/opt/rocm/bin/rocprofv2] - SQ_ITEMS
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_8.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_SMEM_NORM
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_MFMA
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_I8
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_F16
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_9.txt
|-> [/opt/rocm/bin/rocprofv2] ROCProfilerV2: Collecting the following counters:
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_I8
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F16
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_BF16
|-> [/opt/rocm/bin/rocprofv2] - SQ_INSTS_VALU_MFMA_MOPS_F32
[profiling] Current input file: tests/workloads/vcopy/MI300X_A1/perfmon/timestamps.txt
|-> [/opt/rocm/bin/rocprofv2] vcopy testing on GCD 0
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the CPU
|-> [/opt/rocm/bin/rocprofv2] Finished allocating vectors on the GPU
|-> [/opt/rocm/bin/rocprofv2] Finished copying vectors to the GPU
[roofline] Roofline temporarily disabled in MI300
@@ -0,0 +1,5 @@
pmc: GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_IFETCH SQ_IFETCH_LEVEL SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_LDS SQ_INST_LEVEL_LDS SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_SMEM SQ_INST_LEVEL_SMEM SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VMEM SQ_INST_LEVEL_VMEM SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: GRBM_COUNT GRBM_GUI_ACTIVE CPC_ME1_BUSY_FOR_PACKET_DECODE SQ_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_BUSY_CYCLES SQ_LEVEL_WAVES SQ_ACCUM_PREV_HIRES
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_CYCLES SQ_BUSY_CYCLES SQ_BUSY_CU_CYCLES SQ_WAVES SQ_WAVE_CYCLES SQ_INSTS_VALU_CVT SQ_INSTS_VMEM_WR SQ_INSTS_VMEM_RD GRBM_COUNT GRBM_GUI_ACTIVE TCP_GATE_EN1_sum TCP_GATE_EN2_sum TCP_TD_TCP_STALL_CYCLES_sum TCP_TCR_TCP_STALL_CYCLES_sum TA_TA_BUSY_sum TA_BUFFER_WAVEFRONTS_sum TD_TD_BUSY_sum TD_TC_STALL_sum SPI_CSN_WINDOW_VALID SPI_CSN_BUSY CPC_CPC_STAT_BUSY CPC_CPC_STAT_IDLE CPF_CPF_STAT_BUSY CPF_CPF_STAT_STALL TCC_CYCLE_sum TCC_BUSY_sum TCC_PROBE_sum TCC_PROBE_ALL_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VMEM SQ_INSTS_SALU SQ_INSTS_VSKIPPED SQ_INSTS SQ_INSTS_VALU SQ_INSTS_VALU_ADD_F16 SQ_INSTS_VALU_MUL_F16 SQ_INSTS_VALU_FMA_F16 GRBM_SPI_BUSY TCP_READ_TAGCONFLICT_STALL_CYCLES_sum TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum TCP_TA_TCP_STATE_READ_sum TA_BUFFER_READ_WAVEFRONTS_sum TA_BUFFER_WRITE_WAVEFRONTS_sum TD_SPI_STALL_sum TD_LOAD_WAVEFRONT_sum SPI_CSN_NUM_THREADGROUPS SPI_CSN_WAVE CPC_CPC_TCIU_BUSY CPC_CPC_TCIU_IDLE CPF_CPF_TCIU_BUSY CPF_CPF_TCIU_STALL TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum TCC_RW_REQ_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQC_TC_DATA_ATOMIC_REQ SQC_TC_STALL SQC_TC_REQ SQC_DCACHE_REQ_READ_16 SQC_ICACHE_REQ SQC_ICACHE_HITS SQC_ICACHE_MISSES SQC_ICACHE_MISSES_DUPLICATE
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQC_DCACHE_INPUT_VALID_READYB SQC_DCACHE_ATOMIC SQC_DCACHE_REQ_READ_8 SQC_DCACHE_REQ SQC_DCACHE_HITS SQC_DCACHE_MISSES SQC_DCACHE_MISSES_DUPLICATE SQC_DCACHE_REQ_READ_1
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQC_DCACHE_REQ_READ_2 SQC_DCACHE_REQ_READ_4
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_ATOMIC[0] TCC_BUBBLE[0] TCC_CYCLE[0] TCC_EA0_ATOMIC[0] TCC_ATOMIC[1] TCC_BUBBLE[1] TCC_CYCLE[1] TCC_EA0_ATOMIC[1] TCC_ATOMIC[2] TCC_BUBBLE[2] TCC_CYCLE[2] TCC_EA0_ATOMIC[2] TCC_ATOMIC[3] TCC_BUBBLE[3] TCC_CYCLE[3] TCC_EA0_ATOMIC[3] TCC_ATOMIC[4] TCC_BUBBLE[4] TCC_CYCLE[4] TCC_EA0_ATOMIC[4] TCC_ATOMIC[5] TCC_BUBBLE[5] TCC_CYCLE[5] TCC_EA0_ATOMIC[5] TCC_ATOMIC[6] TCC_BUBBLE[6] TCC_CYCLE[6] TCC_EA0_ATOMIC[6] TCC_ATOMIC[7] TCC_BUBBLE[7] TCC_CYCLE[7] TCC_EA0_ATOMIC[7] TCC_ATOMIC[8] TCC_BUBBLE[8] TCC_CYCLE[8] TCC_EA0_ATOMIC[8] TCC_ATOMIC[9] TCC_BUBBLE[9] TCC_CYCLE[9] TCC_EA0_ATOMIC[9] TCC_ATOMIC[10] TCC_BUBBLE[10] TCC_CYCLE[10] TCC_EA0_ATOMIC[10] TCC_ATOMIC[11] TCC_BUBBLE[11] TCC_CYCLE[11] TCC_EA0_ATOMIC[11] TCC_ATOMIC[12] TCC_BUBBLE[12] TCC_CYCLE[12] TCC_EA0_ATOMIC[12] TCC_ATOMIC[13] TCC_BUBBLE[13] TCC_CYCLE[13] TCC_EA0_ATOMIC[13] TCC_ATOMIC[14] TCC_BUBBLE[14] TCC_CYCLE[14] TCC_EA0_ATOMIC[14] TCC_ATOMIC[15] TCC_BUBBLE[15] TCC_CYCLE[15] TCC_EA0_ATOMIC[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_EA0_ATOMIC_LEVEL[0] TCC_EA0_RDREQ[0] TCC_EA0_RDREQ_32B[0] TCC_EA0_RDREQ_LEVEL[0] TCC_EA0_ATOMIC_LEVEL[1] TCC_EA0_RDREQ[1] TCC_EA0_RDREQ_32B[1] TCC_EA0_RDREQ_LEVEL[1] TCC_EA0_ATOMIC_LEVEL[2] TCC_EA0_RDREQ[2] TCC_EA0_RDREQ_32B[2] TCC_EA0_RDREQ_LEVEL[2] TCC_EA0_ATOMIC_LEVEL[3] TCC_EA0_RDREQ[3] TCC_EA0_RDREQ_32B[3] TCC_EA0_RDREQ_LEVEL[3] TCC_EA0_ATOMIC_LEVEL[4] TCC_EA0_RDREQ[4] TCC_EA0_RDREQ_32B[4] TCC_EA0_RDREQ_LEVEL[4] TCC_EA0_ATOMIC_LEVEL[5] TCC_EA0_RDREQ[5] TCC_EA0_RDREQ_32B[5] TCC_EA0_RDREQ_LEVEL[5] TCC_EA0_ATOMIC_LEVEL[6] TCC_EA0_RDREQ[6] TCC_EA0_RDREQ_32B[6] TCC_EA0_RDREQ_LEVEL[6] TCC_EA0_ATOMIC_LEVEL[7] TCC_EA0_RDREQ[7] TCC_EA0_RDREQ_32B[7] TCC_EA0_RDREQ_LEVEL[7] TCC_EA0_ATOMIC_LEVEL[8] TCC_EA0_RDREQ[8] TCC_EA0_RDREQ_32B[8] TCC_EA0_RDREQ_LEVEL[8] TCC_EA0_ATOMIC_LEVEL[9] TCC_EA0_RDREQ[9] TCC_EA0_RDREQ_32B[9] TCC_EA0_RDREQ_LEVEL[9] TCC_EA0_ATOMIC_LEVEL[10] TCC_EA0_RDREQ[10] TCC_EA0_RDREQ_32B[10] TCC_EA0_RDREQ_LEVEL[10] TCC_EA0_ATOMIC_LEVEL[11] TCC_EA0_RDREQ[11] TCC_EA0_RDREQ_32B[11] TCC_EA0_RDREQ_LEVEL[11] TCC_EA0_ATOMIC_LEVEL[12] TCC_EA0_RDREQ[12] TCC_EA0_RDREQ_32B[12] TCC_EA0_RDREQ_LEVEL[12] TCC_EA0_ATOMIC_LEVEL[13] TCC_EA0_RDREQ[13] TCC_EA0_RDREQ_32B[13] TCC_EA0_RDREQ_LEVEL[13] TCC_EA0_ATOMIC_LEVEL[14] TCC_EA0_RDREQ[14] TCC_EA0_RDREQ_32B[14] TCC_EA0_RDREQ_LEVEL[14] TCC_EA0_ATOMIC_LEVEL[15] TCC_EA0_RDREQ[15] TCC_EA0_RDREQ_32B[15] TCC_EA0_RDREQ_LEVEL[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_EA0_WRREQ[0] TCC_EA0_WRREQ_64B[0] TCC_EA0_WRREQ_LEVEL[0] TCC_HIT[0] TCC_EA0_WRREQ[1] TCC_EA0_WRREQ_64B[1] TCC_EA0_WRREQ_LEVEL[1] TCC_HIT[1] TCC_EA0_WRREQ[2] TCC_EA0_WRREQ_64B[2] TCC_EA0_WRREQ_LEVEL[2] TCC_HIT[2] TCC_EA0_WRREQ[3] TCC_EA0_WRREQ_64B[3] TCC_EA0_WRREQ_LEVEL[3] TCC_HIT[3] TCC_EA0_WRREQ[4] TCC_EA0_WRREQ_64B[4] TCC_EA0_WRREQ_LEVEL[4] TCC_HIT[4] TCC_EA0_WRREQ[5] TCC_EA0_WRREQ_64B[5] TCC_EA0_WRREQ_LEVEL[5] TCC_HIT[5] TCC_EA0_WRREQ[6] TCC_EA0_WRREQ_64B[6] TCC_EA0_WRREQ_LEVEL[6] TCC_HIT[6] TCC_EA0_WRREQ[7] TCC_EA0_WRREQ_64B[7] TCC_EA0_WRREQ_LEVEL[7] TCC_HIT[7] TCC_EA0_WRREQ[8] TCC_EA0_WRREQ_64B[8] TCC_EA0_WRREQ_LEVEL[8] TCC_HIT[8] TCC_EA0_WRREQ[9] TCC_EA0_WRREQ_64B[9] TCC_EA0_WRREQ_LEVEL[9] TCC_HIT[9] TCC_EA0_WRREQ[10] TCC_EA0_WRREQ_64B[10] TCC_EA0_WRREQ_LEVEL[10] TCC_HIT[10] TCC_EA0_WRREQ[11] TCC_EA0_WRREQ_64B[11] TCC_EA0_WRREQ_LEVEL[11] TCC_HIT[11] TCC_EA0_WRREQ[12] TCC_EA0_WRREQ_64B[12] TCC_EA0_WRREQ_LEVEL[12] TCC_HIT[12] TCC_EA0_WRREQ[13] TCC_EA0_WRREQ_64B[13] TCC_EA0_WRREQ_LEVEL[13] TCC_HIT[13] TCC_EA0_WRREQ[14] TCC_EA0_WRREQ_64B[14] TCC_EA0_WRREQ_LEVEL[14] TCC_HIT[14] TCC_EA0_WRREQ[15] TCC_EA0_WRREQ_64B[15] TCC_EA0_WRREQ_LEVEL[15] TCC_HIT[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_MISS[0] TCC_READ[0] TCC_REQ[0] TCC_RW_REQ[0] TCC_MISS[1] TCC_READ[1] TCC_REQ[1] TCC_RW_REQ[1] TCC_MISS[2] TCC_READ[2] TCC_REQ[2] TCC_RW_REQ[2] TCC_MISS[3] TCC_READ[3] TCC_REQ[3] TCC_RW_REQ[3] TCC_MISS[4] TCC_READ[4] TCC_REQ[4] TCC_RW_REQ[4] TCC_MISS[5] TCC_READ[5] TCC_REQ[5] TCC_RW_REQ[5] TCC_MISS[6] TCC_READ[6] TCC_REQ[6] TCC_RW_REQ[6] TCC_MISS[7] TCC_READ[7] TCC_REQ[7] TCC_RW_REQ[7] TCC_MISS[8] TCC_READ[8] TCC_REQ[8] TCC_RW_REQ[8] TCC_MISS[9] TCC_READ[9] TCC_REQ[9] TCC_RW_REQ[9] TCC_MISS[10] TCC_READ[10] TCC_REQ[10] TCC_RW_REQ[10] TCC_MISS[11] TCC_READ[11] TCC_REQ[11] TCC_RW_REQ[11] TCC_MISS[12] TCC_READ[12] TCC_REQ[12] TCC_RW_REQ[12] TCC_MISS[13] TCC_READ[13] TCC_REQ[13] TCC_RW_REQ[13] TCC_MISS[14] TCC_READ[14] TCC_REQ[14] TCC_RW_REQ[14] TCC_MISS[15] TCC_READ[15] TCC_REQ[15] TCC_RW_REQ[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: TCC_TAG_STALL[0] TCC_TOO_MANY_EA_WRREQS_STALL[0] TCC_WRITE[0] TCC_TAG_STALL[1] TCC_TOO_MANY_EA_WRREQS_STALL[1] TCC_WRITE[1] TCC_TAG_STALL[2] TCC_TOO_MANY_EA_WRREQS_STALL[2] TCC_WRITE[2] TCC_TAG_STALL[3] TCC_TOO_MANY_EA_WRREQS_STALL[3] TCC_WRITE[3] TCC_TAG_STALL[4] TCC_TOO_MANY_EA_WRREQS_STALL[4] TCC_WRITE[4] TCC_TAG_STALL[5] TCC_TOO_MANY_EA_WRREQS_STALL[5] TCC_WRITE[5] TCC_TAG_STALL[6] TCC_TOO_MANY_EA_WRREQS_STALL[6] TCC_WRITE[6] TCC_TAG_STALL[7] TCC_TOO_MANY_EA_WRREQS_STALL[7] TCC_WRITE[7] TCC_TAG_STALL[8] TCC_TOO_MANY_EA_WRREQS_STALL[8] TCC_WRITE[8] TCC_TAG_STALL[9] TCC_TOO_MANY_EA_WRREQS_STALL[9] TCC_WRITE[9] TCC_TAG_STALL[10] TCC_TOO_MANY_EA_WRREQS_STALL[10] TCC_WRITE[10] TCC_TAG_STALL[11] TCC_TOO_MANY_EA_WRREQS_STALL[11] TCC_WRITE[11] TCC_TAG_STALL[12] TCC_TOO_MANY_EA_WRREQS_STALL[12] TCC_WRITE[12] TCC_TAG_STALL[13] TCC_TOO_MANY_EA_WRREQS_STALL[13] TCC_WRITE[13] TCC_TAG_STALL[14] TCC_TOO_MANY_EA_WRREQS_STALL[14] TCC_WRITE[14] TCC_TAG_STALL[15] TCC_TOO_MANY_EA_WRREQS_STALL[15] TCC_WRITE[15]
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VALU_TRANS_F16 SQ_INSTS_VALU_ADD_F32 SQ_INSTS_VALU_MUL_F32 SQ_INSTS_VALU_FMA_F32 SQ_INSTS_VALU_TRANS_F32 SQ_INSTS_VALU_ADD_F64 SQ_INSTS_VALU_MUL_F64 SQ_INSTS_VALU_FMA_F64 TCP_VOLATILE_sum TCP_TOTAL_ACCESSES_sum TCP_TOTAL_READ_sum TCP_TOTAL_WRITE_sum TA_BUFFER_ATOMIC_WAVEFRONTS_sum TA_BUFFER_TOTAL_CYCLES_sum TD_ATOMIC_WAVEFRONT_sum TD_STORE_WAVEFRONT_sum SPI_RA_REQ_NO_ALLOC SPI_RA_REQ_NO_ALLOC_CSN CPC_CPC_STAT_STALL CPC_UTCL1_STALL_ON_TRANSLATION CPF_CPF_STAT_IDLE CPF_CPF_TCIU_IDLE TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VALU_TRANS_F64 SQ_INSTS_VALU_INT32 SQ_INSTS_VALU_INT64 SQ_INSTS_SMEM SQ_INSTS_FLAT SQ_INSTS_LDS SQ_INSTS_GDS SQ_INSTS_EXP_GDS TCP_TOTAL_ATOMIC_WITH_RET_sum TCP_TOTAL_ATOMIC_WITHOUT_RET_sum TCP_TOTAL_WRITEBACK_INVALIDATES_sum TCP_TOTAL_CACHE_ACCESSES_sum TA_BUFFER_COALESCED_READ_CYCLES_sum TA_BUFFER_COALESCED_WRITE_CYCLES_sum TD_COALESCABLE_WAVEFRONT_sum SPI_RA_RES_STALL_CSN SPI_RA_TMP_STALL_CSN CPC_CPC_UTCL2IU_BUSY CPC_CPC_UTCL2IU_IDLE CPF_CMP_UTCL1_STALL_ON_TRANSLATION TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_BRANCH SQ_INSTS_SENDMSG SQ_WAIT_ANY SQ_WAIT_INST_ANY SQ_ACTIVE_INST_ANY SQ_ACTIVE_INST_VMEM SQ_ACTIVE_INST_LDS SQ_ACTIVE_INST_VALU TCP_UTCL1_TRANSLATION_MISS_sum TCP_UTCL1_TRANSLATION_HIT_sum TCP_UTCL1_PERMISSION_MISS_sum TCP_UTCL1_REQUEST_sum TA_ADDR_STALLED_BY_TC_CYCLES_sum TA_TOTAL_WAVEFRONTS_sum SPI_RA_WAVE_SIMD_FULL_CSN SPI_RA_VGPR_SIMD_FULL_CSN CPC_CPC_UTCL2IU_STALL CPC_ME1_BUSY_FOR_PACKET_DECODE TCC_EA0_WRREQ_sum TCC_EA0_WRREQ_64B_sum TCC_EA0_WR_UNCACHED_32B_sum TCC_EA0_WRREQ_DRAM_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_ACTIVE_INST_SCA SQ_ACTIVE_INST_EXP_GDS SQ_ACTIVE_INST_MISC SQ_ACTIVE_INST_FLAT SQ_INST_CYCLES_VMEM_WR SQ_INST_CYCLES_VMEM_RD SQ_INST_CYCLES_SMEM SQ_INST_CYCLES_SALU TCP_TCC_READ_REQ_sum TCP_TCC_WRITE_REQ_sum TCP_TCC_ATOMIC_WITH_RET_REQ_sum TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum TA_ADDR_STALLED_BY_TD_CYCLES_sum TA_DATA_STALLED_BY_TC_CYCLES_sum SPI_RA_SGPR_SIMD_FULL_CSN SPI_RA_LDS_CU_FULL_CSN CPC_ME1_DC0_SPI_BUSY TCC_EA0_RDREQ_sum TCC_EA0_RDREQ_32B_sum TCC_BUBBLE_sum TCC_EA0_RD_UNCACHED_32B_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_THREAD_CYCLES_VALU SQ_IFETCH SQ_LDS_BANK_CONFLICT SQ_LDS_ADDR_CONFLICT SQ_LDS_UNALIGNED_STALL SQ_WAVES_EQ_64 SQ_WAVES_LT_64 SQ_WAVES_LT_48 TCP_TCC_NC_READ_REQ_sum TCP_TCC_NC_WRITE_REQ_sum TCP_TCC_NC_ATOMIC_REQ_sum TCP_TCC_UC_READ_REQ_sum TA_FLAT_WAVEFRONTS_sum TA_FLAT_READ_WAVEFRONTS_sum SPI_RA_BAR_CU_FULL_CSN SPI_RA_TGLIM_CU_FULL_CSN TCC_EA0_RDREQ_DRAM_sum TCC_TAG_STALL_sum TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_WAVES_LT_32 SQ_WAVES_LT_16 SQ_ITEMS SQ_LDS_MEM_VIOLATIONS SQ_LDS_ATOMIC_RETURN SQ_LDS_IDX_ACTIVE SQ_WAVES_RESTORED SQ_WAVES_SAVED TCP_TCC_UC_WRITE_REQ_sum TCP_TCC_UC_ATOMIC_REQ_sum TCP_TCC_CC_READ_REQ_sum TCP_TCC_CC_WRITE_REQ_sum TA_FLAT_WRITE_WAVEFRONTS_sum TA_FLAT_ATOMIC_WAVEFRONTS_sum SPI_RA_WVLIM_STALL_CSN SPI_SWC_CSC_WR TCC_NORMAL_EVICT_sum TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA0_ATOMIC_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_SMEM_NORM SQ_INSTS_MFMA SQ_INSTS_VALU_MFMA_I8 SQ_INSTS_VALU_MFMA_F16 SQ_INSTS_VALU_MFMA_BF16 SQ_INSTS_VALU_MFMA_F32 SQ_INSTS_VALU_MFMA_F64 SQ_VALU_MFMA_BUSY_CYCLES TCP_TCC_CC_ATOMIC_REQ_sum TCP_TCC_RW_READ_REQ_sum TCP_TCC_RW_WRITE_REQ_sum TCP_TCC_RW_ATOMIC_REQ_sum SPI_VWC_CSC_WR SPI_RA_BULKY_CU_FULL_CSN TCC_EA0_RDREQ_LEVEL_sum TCC_EA0_WRREQ_LEVEL_sum TCC_EA0_ATOMIC_LEVEL_sum TCC_EA0_WRREQ_STALL_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VALU_MFMA_MOPS_I8 SQ_INSTS_VALU_MFMA_MOPS_F16 SQ_INSTS_VALU_MFMA_MOPS_BF16 SQ_INSTS_VALU_MFMA_MOPS_F32 SQ_INSTS_VALU_MFMA_MOPS_F64 SQC_TC_INST_REQ SQC_TC_DATA_READ_REQ SQC_TC_DATA_WRITE_REQ TCP_PENDING_STALL_CYCLES_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc:
gpu:
range:
kernel:
Las diferiencias del archivo han sido suprimidas porque una o mas lineas son muy largas
@@ -0,0 +1,2 @@
workload_name,command,ip_blocks,timestamp,version,hostname,cpu_model,sbios,linux_distro,linux_kernel_version,amd_gpu_kernel_version,cpu_memory,gpu_memory,rocm_version,vbios,compute_partition,memory_partition,gpu_model,gpu_arch,gpu_l1,gpu_l2,cu_per_gpu,simd_per_cu,se_per_gpu,wave_size,workgroup_max_size,max_waves_per_cu,max_sclk,max_mclk,cur_sclk,cur_mclk,total_l2_chan,lds_banks_per_cu,sqc_per_gpu,pipes_per_gpu,hbm_bw,num_xcd
vcopy,./tests/vcopy -n 1048576 -b 256 -i 3,SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF,Thu 30 May 2024 02:19:39 PM (CDT),2,splinter-126-wr-c6,AMD Ryzen 9 7950X 16-Core Processor,"American Megatrends International, LLC.VS2683299N.FD",Ubuntu 22.04.4 LTS,5.18.2-mi300-build-140423-ubuntu-22.04+,,114656528,,6.2.0-13611,113-MI3SRIOV-001,SPX,NPS1,MI300X_A1,gfx942,32,4096,304,4,32,64,1024,32,2100,1300,2100,1300,128,32,160,4,5324.8,8
1 workload_name command ip_blocks timestamp version hostname cpu_model sbios linux_distro linux_kernel_version amd_gpu_kernel_version cpu_memory gpu_memory rocm_version vbios compute_partition memory_partition gpu_model gpu_arch gpu_l1 gpu_l2 cu_per_gpu simd_per_cu se_per_gpu wave_size workgroup_max_size max_waves_per_cu max_sclk max_mclk cur_sclk cur_mclk total_l2_chan lds_banks_per_cu sqc_per_gpu pipes_per_gpu hbm_bw num_xcd
2 vcopy ./tests/vcopy -n 1048576 -b 256 -i 3 SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF Thu 30 May 2024 02:19:39 PM (CDT) 2 splinter-126-wr-c6 AMD Ryzen 9 7950X 16-Core Processor American Megatrends International, LLC.VS2683299N.FD Ubuntu 22.04.4 LTS 5.18.2-mi300-build-140423-ubuntu-22.04+ 114656528 6.2.0-13611 113-MI3SRIOV-001 SPX NPS1 MI300X_A1 gfx942 32 4096 304 4 32 64 1024 32 2100 1300 2100 1300 128 32 160 4 5324.8 8
@@ -0,0 +1,4 @@
Dispatch_ID,GPU_ID,Queue_ID,PID,TID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Wave_Size,Kernel_Name,Start_Timestamp,End_Timestamp,Correlation_ID
1,60633,1,1103031,1103031,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676802805,810969676816525,0
2,60633,1,1103031,1103031,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676837363,810969676849123,0
3,60633,1,1103031,1103031,1048576,256,0,0,4,4,16,64,"vecCopy(double*, double*, double*, int, int) (.kd)",810969676868522,810969676880402,0
1 Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID
2 1 60633 1 1103031 1103031 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676802805 810969676816525 0
3 2 60633 1 1103031 1103031 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676837363 810969676849123 0
4 3 60633 1 1103031 1103031 1048576 256 0 0 4 4 16 64 vecCopy(double*, double*, double*, int, int) (.kd) 810969676868522 810969676880402 0