diff --git a/projects/rocprofiler-compute/src/argparser.py b/projects/rocprofiler-compute/src/argparser.py index 769817c6d3..9ff5ad70f1 100644 --- a/projects/rocprofiler-compute/src/argparser.py +++ b/projects/rocprofiler-compute/src/argparser.py @@ -52,11 +52,11 @@ def omniarg_parser(parser, omniperf_home, omniperf_version): \n\n------------------------------------------------------------------------------- \nExamples: - \n\tomniperf profile -n vcopy_all -- ./vcopy 1048576 256 - \n\tomniperf profile -n vcopy_SPI_TCC -b SQ TCC -- ./vcopy 1048576 256 - \n\tomniperf profile -n vcopy_kernel -k vecCopy -- ./vcopy 1048576 256 - \n\tomniperf profile -n vcopy_disp -d 0 -- ./vcopy 1048576 256 - \n\tomniperf profile -n vcopy_roof --roof-only -- ./vcopy 1048576 256 + \n\tomniperf profile -n vcopy_all -- ./vcopy -n 1048576 -b 256 + \n\tomniperf profile -n vcopy_SPI_TCC -b SQ TCC -- ./vcopy -n 1048576 -b 256 + \n\tomniperf profile -n vcopy_kernel -k vecCopy -- ./vcopy -n 1048576 -b 256 + \n\tomniperf profile -n vcopy_disp -d 0 -- ./vcopy -n 1048576 -b 256 + \n\tomniperf profile -n vcopy_roof --roof-only -- ./vcopy -n 1048576 -b 256 \n-------------------------------------------------------------------------------\n """, prog="tool", diff --git a/projects/rocprofiler-compute/src/docs/getting_started.md b/projects/rocprofiler-compute/src/docs/getting_started.md index 80ae888f06..5b9f8c0f9e 100644 --- a/projects/rocprofiler-compute/src/docs/getting_started.md +++ b/projects/rocprofiler-compute/src/docs/getting_started.md @@ -14,7 +14,7 @@ To collect the default set of data for all kernels in the target application, launch, e.g.: ```shell - $ omniperf profile -n vcopy_data -- ./vcopy 1048576 256 + $ omniperf profile -n vcopy_data -- ./vcopy -n 1048576 -b 256 ``` The app runs, each kernel is launched, and profiling results are generated. By default, results are written to (e.g.,) ./workloads/vcopy_data (configurable via the `-n` argument). 
To collect all requested profile information, it may be required to replay kernels multiple times. diff --git a/projects/rocprofiler-compute/src/docs/profiling.md b/projects/rocprofiler-compute/src/docs/profiling.md index 56c234604e..633cb9eec5 100644 --- a/projects/rocprofiler-compute/src/docs/profiling.md +++ b/projects/rocprofiler-compute/src/docs/profiling.md @@ -23,7 +23,7 @@ the MI200 platform. $ hipcc vcopy.cpp -o vcopy $ ls vcopy vcopy.cpp -$ ./vcopy 1048576 256 +$ ./vcopy -n 1048576 -b 256 Finished allocating vectors on the CPU Finished allocating vectors on the GPU Finished copying vectors to the GPU @@ -42,44 +42,43 @@ The *omniperf* script, availible through the [Omniperf](https://github.com/AMDRe **omniperf help:** ```shell-session $ omniperf profile --help -ROC Profiler: /usr/bin/rocprof - -usage: - +usage: + omniperf profile --name [profile options] [roofline options] -- - + ------------------------------------------------------------------------------- - + Examples: - - omniperf profile -n vcopy_all -- ./vcopy 1048576 256 - - omniperf profile -n vcopy_SPI_TCC -b SQ TCC -- ./vcopy 1048576 256 - - omniperf profile -n vcopy_kernel -k vecCopy -- ./vcopy 1048576 256 - - omniperf profile -n vcopy_disp -d 0 -- ./vcopy 1048576 256 - - omniperf profile -n vcopy_roof --roof-only -- ./vcopy 1048576 256 - + + omniperf profile -n vcopy_all -- ./vcopy -n 1048576 -b 256 + + omniperf profile -n vcopy_SPI_TCC -b SQ TCC -- ./vcopy -n 1048576 -b 256 + + omniperf profile -n vcopy_kernel -k vecCopy -- ./vcopy -n 1048576 -b 256 + + omniperf profile -n vcopy_disp -d 0 -- ./vcopy -n 1048576 -b 256 + + omniperf profile -n vcopy_roof --roof-only -- ./vcopy -n 1048576 -b 256 + ------------------------------------------------------------------------------- - + Help: - -h, --help show this help message and exit + -h, --help show this help message and exit General Options: - -v, --version show program's version number and exit - -V, --verbose Increase output verbosity + -v, 
--version show program's version number and exit + -V, --verbose Increase output verbosity Profile Options: -n , --name Assign a name to workload. -p , --path Specify path to save workload. - (DEFAULT: /home/colramos/GitHub/omniperf/workloads/) + (DEFAULT: /home/colramos/GitHub/omniperf-pub/workloads/) -k [ ...], --kernel [ ...] Kernel filtering. + -d [ ...], --dispatch [ ...] Dispatch ID filtering. -b [ ...], --ipblocks [ ...] IP block filtering: SQ SQC @@ -90,16 +89,19 @@ Profile Options: SPI CPC CPF - -d [ ...], --dispatch [ ...] Dispatch ID filtering. + --join-type Choose how to join rocprof runs: (DEFAULT: grid) + kernel (i.e. By unique kernel name dispatches) + grid (i.e. By unique kernel name + grid size dispatches) --no-roof Profile without collecting roofline data. -- [ ...] Provide command for profiling after double dash. + --kernel-verbose Specify Kernel Name verbose level 1-5. Lower the level, shorter the kernel name. (DEFAULT: 2) (DISABLE: 5) Standalone Roofline Options: --roof-only Profile roofline data only. --sort Overlay top kernels or top dispatches: (DEFAULT: kernels) kernels dispatches - -m , --mem-level Filter by memory level: (DEFAULT: ALL) + -m [ ...], --mem-level [ ...] Filter by memory level: (DEFAULT: ALL) HBM L2 vL1D @@ -113,36 +115,40 @@ The following sample command profiles the *vcopy* workload. 
**vcopy profiling:** ```shell-session -$ omniperf profile --name vcopy -- ./vcopy 1048576 256 +$ omniperf profile --name vcopy -- ./vcopy -n 1048576 -b 256 -Resolving rocprof -ROC Profiler: /usr/bin/rocprof +ROC Profiler: /opt/rocm-5.2.1/bin/rocprof +Execution mode = profile + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| +| | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ +| |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| - -------------- -Profile only -------------- - -omniperf ver: 1.0.8-PR1 -Path: /home/colramos/GitHub/omniperf-pub/workloads -Target: mi200 -Command: /home/colramos/vcopy 1048576 256 -Kernel Selection: None -Dispatch Selection: None +SoC = mi200 +Profiler choice = rocprofv1 +omniperf ver: 1.0.10 +Path: /home/colramos/omniperf/workloads/vcopy_test/mi200 +Target: mi200 +Command: ./vcopy -n 1048576 -b 256 +Kernel Selection: None +Dispatch Selection: None IP Blocks: All -Log: /home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/log.txt +KernelName verbose: 2 -/home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/perfmon/SQ_INST_LEVEL_SMEM.txt -RPL: on '230411_165021' from '/opt/rocm-5.2.1' in '/home/colramos/GitHub/omniperf-pub' -RPL: profiling '""/home/colramos/vcopy 1048576 256""' -RPL: input file '/home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/perfmon/SQ_INST_LEVEL_SMEM.txt' -RPL: output dir '/tmp/rpl_data_230411_165021_26406' -RPL: result dir '/tmp/rpl_data_230411_165021_26406/input0_results_230411_165021' -Finished allocating vectors on the CPU -ROCProfiler: input from "/tmp/rpl_data_230411_165021_26406/input0.xml" +Current input file: /home/colramos/omniperf/workloads/vcopy_test/mi200/perfmon/pmc_perf_2.txt +RPL: on '231120_142934' from '/opt/rocm-5.2.1' in '/home/colramos/omniperf' +RPL: profiling '""./vcopy -n 1048576 -b 256""' +RPL: input file '/home/colramos/omniperf/workloads/vcopy_test/mi200/perfmon/pmc_perf_2.txt' +RPL: output dir '/tmp/rpl_data_231120_142934_262088' +RPL: result dir 
'/tmp/rpl_data_231120_142934_262088/input0_results_231120_142934' +ROCProfiler: input from "/tmp/rpl_data_231120_142934_262088/input0.xml" gpu_index = kernel = range = - 3 metrics - SQ_INSTS_SMEM, SQ_INST_LEVEL_SMEM, SQ_ACCUM_PREV_HIRES + 26 metrics + SQ_INSTS_VALU_TRANS_F16, SQ_INSTS_VALU_ADD_F32, SQ_INSTS_VALU_MUL_F32, SQ_INSTS_VALU_FMA_F32, SQ_INSTS_VALU_TRANS_F32, SQ_INSTS_VALU_ADD_F64, SQ_INSTS_VALU_MUL_F64, SQ_INSTS_VALU_FMA_F64, TCP_VOLATILE_sum, TCP_TOTAL_ACCESSES_sum, TCP_TOTAL_READ_sum, TCP_TOTAL_WRITE_sum, TA_BUFFER_ATOMIC_WAVEFRONTS_sum, TA_BUFFER_TOTAL_CYCLES_sum, TD_ATOMIC_WAVEFRONT_sum, TD_STORE_WAVEFRONT_sum, SPI_RA_REQ_NO_ALLOC, SPI_RA_REQ_NO_ALLOC_CSN, CPC_CPC_STAT_STALL, CPC_UTCL1_STALL_ON_TRANSLATION, CPF_CPF_STAT_IDLE, CPF_CPF_TCIU_IDLE, TCC_REQ_sum, TCC_STREAMING_REQ_sum, TCC_HIT_sum, TCC_MISS_sum +vcopy testing on GCD 0 +Finished allocating vectors on the CPU Finished allocating vectors on the GPU Finished copying vectors to the GPU sw thinks it moved 1.000000 KB per wave @@ -153,9 +159,10 @@ Finished copying the output vector from the GPU to the CPU Releasing GPU memory Releasing CPU memory -... ... -ROCPRofiler: 1 contexts collected, output directory /tmp/rpl_data_220527_130317_1787038/input_results_220527_130317 -File 'workloads/vcopy/mi200/timestamps.csv' is generating +... + +Empirical Roofline Calculation +Copyright © 2022 Advanced Micro Devices, Inc. All rights reserved. Total detected GPU devices: 2 GPU Device 0: Profiling... 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] @@ -213,7 +220,7 @@ etc. The SOC names are generated as a part of Omniperf, and do not necessarily > Note: Additionally, you'll notice a few extra files. An SoC parameters file, *sysinfo.csv*, is created to reflect the target device settings. All profiling output is stored in *log.txt*. Roofline specific benchmark results are stored in *roofline.csv*. 
-```shell +```shell-session $ ls workloads/vcopy/mi200/ total 112 drwxrwxr-x 3 colramos colramos 4096 Apr 11 16:42 . @@ -254,39 +261,42 @@ i.e. filtering dispatch X, but dispatch X does not match your kernel name filter One can profile a selected IP Block to speed up the profiling process. All profiling results are accumulated in the same target directory, without overwriting those for other IP blocks, hence enabling the incremental profiling and analysis. The following example only gathers hardware counters for SQ and TCC, skipping all other IP Blocks: -```shell -$ omniperf profile --name vcopy -b SQ TCC -- ./sample/vcopy 1048576 256 -Resolving rocprof -ROC Profiler: /usr/bin/rocprof +```shell-session +$ omniperf profile --name vcopy -b SQ TCC -- ./sample/vcopy -n 1048576 -b 256 +ROC Profiler: /opt/rocm-5.2.1/bin/rocprof +Execution mode = profile + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| +| | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ +| |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| - -------------- -Profile only -------------- - -omniperf ver: 1.0.8-PR1 -Path: /home/colramos/GitHub/omniperf-pub/workloads -Target: mi200 -Command: /home/colramos/vcopy 1048576 256 -Kernel Selection: None -Dispatch Selection: None -IP Blocks: ['SQ', 'TCC'] -fname: pmc_sq_perf2: Added +SoC = mi200 +Profiler choice = rocprofv1 +fname: pmc_cpc_perf: Skipped +fname: pmc_spi_perf: Skipped +fname: pmc_cpf_perf: Skipped +fname: pmc_tcp_perf: Skipped +fname: pmc_sq_perf4: Added +fname: pmc_tcc_perf: Added +fname: pmc_sq_perf8: Added +fname: pmc_ta_perf: Skipped +fname: pmc_sq_perf1: Added +fname: pmc_sq_perf3: Added fname: pmc_td_perf: Skipped fname: pmc_tcc2_perf: Skipped -fname: pmc_tcp_perf: Skipped -fname: pmc_spi_perf: Skipped -fname: pmc_sq_perf4: Added fname: pmc_sqc_perf1: Skipped -fname: pmc_tcc_perf: Added -fname: pmc_cpf_perf: Skipped -fname: pmc_sq_perf8: Added -fname: pmc_cpc_perf: Skipped -fname: pmc_sq_perf1: Added 
-fname: pmc_ta_perf: Skipped -fname: pmc_sq_perf3: Added fname: pmc_sq_perf6: Added -Log: /home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/log.txt +fname: pmc_sq_perf2: Added +omniperf ver: 1.0.10 +Path: /home/colramos/omniperf/vcopy_/mi200 +Target: mi200 +Command: ./vcopy -n 1048576 -b 256 +Kernel Selection: None +Dispatch Selection: None +IP Blocks: ['sq', 'tcc'] +KernelName verbose: 2 ... ``` @@ -294,37 +304,28 @@ Log: /home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/log.txt Kernel filtering is based on the name of the kernel(s) you'd like to isolate. Use a kernel name substring list to isolate desired kernels. The following example demonstrates profiling isolating the kernel matching substring "vecCopy": -```shell -$ omniperf profile --name vcopy -k vecCopy -- ./vcopy 1048576 256 -Resolving rocprof -ROC Profiler: /usr/bin/rocprof +```shell-session +$ omniperf profile --name vcopy -k vecCopy -- ./vcopy -n 1048576 -b 256 +ROC Profiler: /opt/rocm-5.2.1/bin/rocprof +Execution mode = profile + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| +| | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ +| |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| - -------------- -Profile only -------------- - -omniperf ver: 1.0.8-PR1 -Path: /home/colramos/GitHub/omniperf-pub/workloads -Target: mi200 -Command: /home/colramos/vcopy 1048576 256 -Kernel Selection: ['vecCopy'] -Dispatch Selection: None +SoC = mi200 +Profiler choice = rocprofv1 +omniperf ver: 1.0.10 +Path: /home/colramos/omniperf/workloads/vcopy/mi200 +Target: mi200 +Command: ./vcopy -n 1048576 -b 256 +Kernel Selection: ['vecCopy'] +Dispatch Selection: None IP Blocks: All -Log: /home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/log.txt - -/home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/perfmon/SQ_INST_LEVEL_SMEM.txt -RPL: on '230411_170300' from '/opt/rocm-5.2.1' in '/home/colramos/GitHub/omniperf-pub' -RPL: profiling '""/home/colramos/vcopy 1048576 
256""' -RPL: input file '/home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/perfmon/SQ_INST_LEVEL_SMEM.txt' -RPL: output dir '/tmp/rpl_data_230411_170300_29696' -RPL: result dir '/tmp/rpl_data_230411_170300_29696/input0_results_230411_170300' -Finished allocating vectors on the CPU -ROCProfiler: input from "/tmp/rpl_data_230411_170300_29696/input0.xml" - gpu_index = - kernel = vecCopy - -... ... +KernelName verbose: 2 +... ``` #### Dispatch Filtering @@ -332,35 +333,26 @@ Dispatch filtering is based on the *global* dispatch index of kernels in a run. The following example profiles only the 0th dispatched kernel in execution of the application: ```shell-session -$ omniperf profile --name vcopy -d 0 -- ./vcopy 1048576 256 -Resolving rocprof -ROC Profiler: /usr/bin/rocprof +$ omniperf profile --name vcopy -d 0 -- ./vcopy -n 1048576 -b 256 +ROC Profiler: /opt/rocm-5.2.1/bin/rocprof +Execution mode = profile + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| +| | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ +| |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| - -------------- -Profile only -------------- - -omniperf ver: 1.0.8-PR1 -Path: /home/colramos/GitHub/omniperf-pub/workloads -Target: mi200 -Command: /home/colramos/vcopy 1048576 256 -Kernel Selection: None -Dispatch Selection: ['0'] +SoC = mi200 +Profiler choice = rocprofv1 +omniperf ver: 1.0.10 +Path: /home/colramos/omniperf/workloads/vcopy/mi200 +Target: mi200 +Command: ./vcopy -n 1048576 -b 256 +Kernel Selection: None +Dispatch Selection: ['0'] IP Blocks: All -Log: /home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/log.txt - -/home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/perfmon/SQ_INST_LEVEL_SMEM.txt -RPL: on '230411_170356' from '/opt/rocm-5.2.1' in '/home/colramos/GitHub/omniperf-pub' -RPL: profiling '""/home/colramos/vcopy 1048576 256""' -RPL: input file '/home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/perfmon/SQ_INST_LEVEL_SMEM.txt' 
-RPL: output dir '/tmp/rpl_data_230411_170356_30314' -RPL: result dir '/tmp/rpl_data_230411_170356_30314/input0_results_230411_170356' -Finished allocating vectors on the CPU -ROCProfiler: input from "/tmp/rpl_data_230411_170356_30314/input0.xml" - gpu_index = - kernel = - range = 0 +KernelName verbose: 2 ... ``` @@ -383,27 +375,28 @@ Standalone Roofline Options: #### Roofline Only The following example demonstrates profiling roofline data only: ```shell-session -$ omniperf profile --name vcopy --roof-only -- ./vcopy 1048576 256 -Resolving rocprof -ROC Profiler: /usr/bin/rocprof +$ omniperf profile --name vcopy --roof-only -- ./vcopy -n 1048576 -b 256 +ROC Profiler: /opt/rocm-5.2.1/bin/rocprof +Execution mode = profile + ___ _ __ + / _ \ _ __ ___ _ __ (_)_ __ ___ _ __ / _| +| | | | '_ ` _ \| '_ \| | '_ \ / _ \ '__| |_ +| |_| | | | | | | | | | | |_) | __/ | | _| + \___/|_| |_| |_|_| |_|_| .__/ \___|_| |_| + |_| - --------- -Roofline only --------- - -Checking for roofline.csv in /home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200 -No roofline data found. Generating... -Empirical Roofline Calculation -Copyright © 2022 Advanced Micro Devices, Inc. All rights reserved. -Total detected GPU devices: 4 -GPU Device 0: Profiling... - 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] - ... ... -Checking for roofline.csv in /home/colramos/GitHub/omniperf-pub/workloads/mix/mi200 -Checking for sysinfo.csv in /home/colramos/GitHub/omniperf-pub/workloads/mix/mi200 -Checking for pmc_perf.csv in /home/colramos/GitHub/omniperf-pub/workloads/mix/mi200 -Empirical Roofline PDFs saved! +SoC = mi200 +Profiler choice = rocprofv1 +[roofline] Generating pmc_perf.csv +omniperf ver: 1.0.10 +Path: /home/colramos/omniperf/workloads/vcopy_test/mi200 +Target: mi200 +Command: ./vcopy -n 1048576 -b 256 +Kernel Selection: None +Dispatch Selection: None +IP Blocks: All +KernelName verbose: 2 +... 
``` An inspection of our workload output folder shows .pdf plots were generated successfully ```shell-session