[rocprofv3-benchmark] SDK and rocprofv3 Benchmarking Suite (#157)
* Adding Benchmarking Stg1
* config fix
* reset
* add jpeg and decode traces in iteration
* address comments benchmark config files.
* address comments.
* address comments.
* address comments: revert cntrl ctx.
* address comments: revert csv output.
* resolve merge conflits.
* format.
* build fix.
* fix hip runtime api traces.
* loop cb services.
* format.
* bug fix.
* Fix operator>
- public C++ comparison operator
* Update configuration options
- support selected regions (--selected-regions)
- support writing output config json (--output-config)
- update serialization data
* rocprofv3 tool library misc updates
- lambda for starting context
- support for writing config json
* Tool library updates
- Finished support for all benchmarking modes
- Added build spec support to config json
* Fix ROCPROFILER_SOVERSION
- this value should not be multiplied by 10,000
* Minor tweak to rocprofv3
* Benchmarking scripts
* formatting
* Fix duplicate include
* Add reproducible-dispatch-count test app
- used in benchmarking
* registration logging
- report number of registered contexts and active contexts after client initialization
* Serialize environment in rocprofv3 output config
* ROCPROFILER_BUILD_BENCHMARK CMake option
* Update benchmark SQL schema
- hash_id is text
- add md5sum to benchmarked_app
- remove app_id from benchmarked_sdk
- add sdk_id to benchmark_config
- separate hip_trace into hip_runtime_trace and hip_compiler_trace
- use INT instead of INTEGER for MySQL compatibility
- add count column in benchmark_statistics
- allow std_dev to be NULL in benchmark_statistics
* Update rocprofv3-benchmark.py
- use md5 instead of python hash (which includes random seed)
- use args.mysql_database
- compute md5sum of executable
- fix insert_benchmark_config
- marker trace fixes
- memory allocation fixes
- split hip_trace into hip_{runtime,compiler}_trace
- remove app_id from benchmarked_sdk
- support warmup runs
- count field in benchmark_statistics
* Support launcher and environment in YAML
* Update reproducible-dispatch-count.cpp
- support mode which doesn't use hip event timing
* Misc rocprofv3-benchmark.py updates
- fix some MySQL support
- remove some unnecessary logging
* support mysql db.
* Format.
* Updated SQL input files
- moved benchmark_schema.sql to benchmark_table.sql
- added benchmark_views.sql
- uses {{metric}} syntax for variable substitution
* cmake formatting
* update rocprofv3-benchmark.py
- benchmark config labels
- overhead views
* Encode rocprofv3-benchmark PID in rocprofv3 and timem output files
* Minor tweak to benchmark_views.sql
- include count
- reorder fields for readability
* split statements and use IS if values is NONE.
* use backtick instead of double quotes and add IS before NOT NULL.:
* Adding Mandelbrot Benchmark App
* Adding Dockerfile example
* Update dockerfile
* Update dockerfile
* [SDK] rocprofiler_query_external_correlation_id_request_kind_name
* Execution-profile benchmark mode
* Execution profile SQL support
* Rename mandlebrot folder + misc clang-tidy
* [rocprofv3-benchmark] Execution profile support
* Update installation
* add work dir when setting git revision, useful when building outside src.
* Set FULL_VERSION_STRING and ROCPROFILER_SDK_GIT_REVISION
- when benchmark folder is top-level
* Remove unused python packages from requirements.txt
* Use ldd/pyelftools to include linked libs for md5sum
- also add --filter-benchmark and --filter-rocprofv3 options
- support labeling the rocprofv3 options
- use more argparse groups
- more generic application of filters
- support variable substitution in environment, e.g. PATH=/some/path:$PATH
* Environment improvements
- improve reproducibility when env set via input file vs. shell
- support "environment-ignore" to remove environment variables
* Misc formatting
* Misc. fix
* use backticks for defining new columns name
* Support shuffling the order of benchmark modes/rocprofv3 args
* Address review comments
* Update Dockerfile
- rename to Dockerfile
- reduce to one layer
* Support docker build arg BRANCH
---------
Co-authored-by: Ammar ELWazir <aelwazir@amd.com>
Co-authored-by: Kandula, Venkateshwar reddy <Venkateshwarreddy.Kandula@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>
Co-authored-by: Madsen, Jonathan <Jonathan.Madsen@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
[ROCm/rocprofiler-sdk commit: 6f17da7ade]
Tento commit je obsažen v:
+2
-2
@@ -70,7 +70,7 @@ jobs:
|
||||
- name: Run clang-format
|
||||
run: |
|
||||
set +e
|
||||
FILES=$(find samples source tests -type f | egrep '\.(h|hpp|hh|c|cc|cpp)(|\.in)$')
|
||||
FILES=$(find samples source tests benchmark -type f | egrep '\.(h|hpp|hh|c|cc|cpp)(|\.in)$')
|
||||
FORMAT_OUT=$(clang-format-11 -i ${FILES})
|
||||
if [ $(git diff | wc -l) -ne 0 ]; then
|
||||
echo -e "\nError! Code not formatted. Run clang-format (version 11)...\n"
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
OUTFILE=missing_newline.txt
|
||||
for i in $(find source/lib source/include tests samples cmake -type f | egrep -v '\.bin$'); do VAL=$(tail -c 1 ${i}); if [ -n "${VAL}" ]; then echo "- ${i}" >> ${OUTFILE}; fi; done
|
||||
for i in $(find source tests samples benchmark docker cmake -type f | egrep -v '\.(bin|png|csv)$|source/docs/_(build|doxygen)'); do VAL=$(tail -c 1 ${i}); if [ -n "${VAL}" ]; then echo "- ${i}" >> ${OUTFILE}; fi; done
|
||||
if [[ -f ${OUTFILE} && $(cat ${OUTFILE} | wc -l) -gt 0 ]]; then
|
||||
echo -e "\nError! Source code missing new line at end of file...\n"
|
||||
echo -e "\nFiles:\n"
|
||||
|
||||
@@ -54,6 +54,7 @@ if(Git_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
||||
|
||||
execute_process(
|
||||
COMMAND ${GIT_EXECUTABLE} rev-parse HEAD
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE ROCPROFILER_SDK_GIT_REVISION
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
|
||||
else()
|
||||
@@ -119,6 +120,10 @@ if(ROCPROFILER_BUILD_SAMPLES)
|
||||
add_subdirectory(samples)
|
||||
endif()
|
||||
|
||||
if(ROCPROFILER_BUILD_BENCHMARK)
|
||||
add_subdirectory(benchmark)
|
||||
endif()
|
||||
|
||||
include(rocprofiler_config_packaging)
|
||||
|
||||
rocprofiler_print_features()
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
# Exclude databases
|
||||
*.db
|
||||
*.db-journal
|
||||
/.rocprofv3/**
|
||||
|
||||
# Build directories
|
||||
/compile_commands.json
|
||||
/build*
|
||||
/.cache
|
||||
*.vscode
|
||||
@@ -0,0 +1,101 @@
|
||||
#
|
||||
# Integration tests
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(COMMAND rocprofiler_deactivate_clang_tidy)
|
||||
rocprofiler_deactivate_clang_tidy()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-sdk-benchmark LANGUAGES C CXX)
|
||||
|
||||
#
|
||||
# project options
|
||||
#
|
||||
option(ROCPROFILER_BENCHMARK_INSTALL_TIMEM "Install timem" ON)
|
||||
|
||||
#
|
||||
# cmake overrides
|
||||
#
|
||||
set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "benchmark")
|
||||
set(ROCPROFILER_SDK_BENCHMARK_SOURCE_DIR "${PROJECT_SOURCE_DIR}")
|
||||
set(ROCPROFILER_SDK_BENCHMARK_BINARY_DIR "${PROJECT_BINARY_DIR}")
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE
|
||||
"Release"
|
||||
CACHE STRING "" FORCE)
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_HIP_STANDARD 17)
|
||||
set(CMAKE_HIP_EXTENSIONS OFF)
|
||||
set(CMAKE_HIP_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
|
||||
|
||||
enable_testing()
|
||||
include(CTest)
|
||||
|
||||
include(GNUInstallDirs)
|
||||
# always use lib instead of lib64
|
||||
set(CMAKE_INSTALL_LIBDIR "lib")
|
||||
set(CMAKE_INSTALL_LIB64DIR "lib64")
|
||||
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}")
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}")
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}")
|
||||
set(CMAKE_DATAROOT_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}")
|
||||
|
||||
# define the library output directory
|
||||
if(PROJECT_IS_TOP_LEVEL)
|
||||
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/../VERSION" FULL_VERSION_STRING LIMIT_COUNT 1)
|
||||
string(REGEX REPLACE "(\n|\r)" "" FULL_VERSION_STRING "${FULL_VERSION_STRING}")
|
||||
string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)(.*)" "\\1.\\2.\\3"
|
||||
ROCPROFILER_SDK_VERSION "${FULL_VERSION_STRING}")
|
||||
|
||||
find_package(Git)
|
||||
|
||||
if(Git_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/../.git")
|
||||
execute_process(
|
||||
COMMAND ${GIT_EXECUTABLE} rev-parse HEAD
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/..
|
||||
OUTPUT_VARIABLE ROCPROFILER_SDK_GIT_REVISION
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
|
||||
else()
|
||||
set(ROCPROFILER_SDK_GIT_REVISION "")
|
||||
endif()
|
||||
else()
|
||||
set(ROCPROFILER_BENCHMARK_INSTALL_PREFIX
|
||||
${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-sdk/benchmark
|
||||
CACHE PATH "relative install prefix for rocprofiler-sdk benchmark")
|
||||
set(_GNUInstallDirs_VARIABLES
|
||||
BINDIR
|
||||
SBINDIR
|
||||
LIBEXECDIR
|
||||
SYSCONFDIR
|
||||
SHAREDSTATEDIR
|
||||
LOCALSTATEDIR
|
||||
RUNSTATEDIR
|
||||
LIBDIR
|
||||
LIB64DIR # locally defined
|
||||
INCLUDEDIR
|
||||
OLDINCLUDEDIR
|
||||
DATAROOTDIR
|
||||
DATADIR
|
||||
INFODIR
|
||||
LOCALEDIR
|
||||
MANDIR
|
||||
DOCDIR)
|
||||
|
||||
foreach(_INSTALL_DIR ${_GNUInstallDirs_VARIABLES})
|
||||
set(CMAKE_INSTALL_${_INSTALL_DIR}
|
||||
${ROCPROFILER_BENCHMARK_INSTALL_PREFIX}/${CMAKE_INSTALL_${_INSTALL_DIR}})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# include the timem executable
|
||||
include(timem)
|
||||
|
||||
add_subdirectory(source)
|
||||
@@ -0,0 +1,21 @@
|
||||
# Benchmark Suite
|
||||
|
||||
## Generate Data
|
||||
|
||||
From the current directory:
|
||||
|
||||
```shell
|
||||
cmake -B build-benchmark .
|
||||
cd build-benchmark
|
||||
export PATH=${PWD}/bin:${PATH}
|
||||
rocprofv3-benchmark -i ./example.yml -n 2
|
||||
```
|
||||
|
||||
```shell
|
||||
sqlite3 benchmark.db
|
||||
```
|
||||
|
||||
```sql
|
||||
SELECT * FROM benchmark_metrics;
|
||||
SELECT * FROM benchmark_statistics;
|
||||
```
|
||||
@@ -0,0 +1,53 @@
|
||||
#
|
||||
# timem installation
|
||||
#
|
||||
|
||||
if(NOT ROCPROFILER_BENCHMARK_INSTALL_TIMEM)
|
||||
find_program(
|
||||
TIMEM_EXECUTABLE
|
||||
NAMES timem
|
||||
HINTS ${PROJECT_BINARY_DIR}
|
||||
PATHS ${PROJECT_BINARY_DIR}
|
||||
PATH_SUFFIXES bin)
|
||||
endif()
|
||||
|
||||
if(NOT TIMEM_EXECUTABLE OR NOT EXISTS "${TIMEM_EXECUTABLE}")
|
||||
set(TIMEM_INSTALLER
|
||||
${CMAKE_CURRENT_BINARY_DIR}/installer/timemory-timem-1.0.0-Linux.sh)
|
||||
find_program(SHELL_EXECUTABLE NAMES sh bash REQUIRED)
|
||||
|
||||
file(
|
||||
DOWNLOAD
|
||||
https://github.com/ROCm/timemory/releases/download/timemory-timem%2Fv0.0.4/timemory-timem-1.0.0-Linux.sh
|
||||
${TIMEM_INSTALLER}
|
||||
EXPECTED_MD5 63da7df7996a86d6d9ce312276c2f014
|
||||
INACTIVITY_TIMEOUT 30
|
||||
TIMEOUT 300
|
||||
SHOW_PROGRESS)
|
||||
|
||||
execute_process(
|
||||
COMMAND ${SHELL_EXECUTABLE} ${TIMEM_INSTALLER} --prefix=${PROJECT_BINARY_DIR}
|
||||
--exclude-subdir --skip-license
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
|
||||
RESULT_VARIABLE _RET
|
||||
OUTPUT_VARIABLE _OUT
|
||||
ERROR_VARIABLE _ERR
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
if(NOT EXISTS ${PROJECT_BINARY_DIR}/bin/timem OR NOT _RET EQUAL 0)
|
||||
message(
|
||||
FATAL_ERROR
|
||||
"timem installation failed with exit code ${_RET}.\nSTDOUT:\n\t${_OUT}\nSTDERR:\n\t${_ERR}"
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
find_program(
|
||||
TIMEM_EXECUTABLE
|
||||
NAMES timem REQUIRED
|
||||
HINTS ${PROJECT_BINARY_DIR}
|
||||
PATHS ${PROJECT_BINARY_DIR}
|
||||
PATH_SUFFIXES bin)
|
||||
|
||||
add_executable(rocprofiler-sdk::timem IMPORTED)
|
||||
set_property(TARGET rocprofiler-sdk::timem PROPERTY IMPORTED_LOCATION ${TIMEM_EXECUTABLE})
|
||||
@@ -0,0 +1,50 @@
|
||||
defaults:
|
||||
rocprofv3:
|
||||
# keys such as "runtime", "kernel", etc. below can be used with --filter-rocprofv3 option
|
||||
runtime: [--runtime-trace]
|
||||
sys: [--sys-trace]
|
||||
kernel: [--kernel-trace]
|
||||
hip: [--hip-trace]
|
||||
hsa: [--hsa-trace]
|
||||
memcpy: [--memory-copy-trace]
|
||||
malloc: [--memory-allocation-trace]
|
||||
scratch: [--scratch-memory-trace]
|
||||
counters: [--pmc, SQ_WAVES]
|
||||
benchmark:
|
||||
- baseline
|
||||
- disabled-sdk-contexts
|
||||
- sdk-buffer-overhead
|
||||
- sdk-callback-overhead
|
||||
- tool-runtime-overhead
|
||||
group:
|
||||
- examples
|
||||
environment:
|
||||
ROCR_VISIBLE_DEVICES: "0,2"
|
||||
HIP_VISIBLE_DEVICES: "0,2"
|
||||
|
||||
jobs:
|
||||
# explicitly specifies name, group, and command. inherits default rocprofv3 commands and benchmark modes
|
||||
- name: hip-in-libraries
|
||||
group: [examples, multigpu, multistream]
|
||||
command: [hip-in-libraries]
|
||||
|
||||
# explicitly specifies name, group, command, and rocprofv3. inherits default benchmark modes
|
||||
- name: transpose
|
||||
group: [examples, multithreaded, multigpu, multistream]
|
||||
command: [transpose, 4, 500, 10]
|
||||
launcher: [mpirun, -n, 1]
|
||||
rocprofv3:
|
||||
- [-r, --pmc, SQ_WAVES]
|
||||
- [--kernel-trace]
|
||||
- [--hip-trace]
|
||||
|
||||
# explicitly specifies name, command, rocprofv3, and benchmark. inherits default group(s)
|
||||
- name: hip-graph
|
||||
command: [hip-graph, 8, 500]
|
||||
rocprofv3:
|
||||
- [--kernel-trace]
|
||||
- [--hip-trace]
|
||||
- [--hsa-trace]
|
||||
benchmark:
|
||||
- baseline
|
||||
- disabled-sdk-contexts
|
||||
@@ -0,0 +1,32 @@
|
||||
defaults:
|
||||
rocprofv3:
|
||||
# keys are not provided so --filter-rocprofv3 option isn't supported for this YAML input
|
||||
- [--runtime-trace]
|
||||
- [--sys-trace]
|
||||
- [--kernel-trace]
|
||||
- [--hip-trace]
|
||||
- [--hsa-trace]
|
||||
- [--memory-copy-trace]
|
||||
- [--memory-allocation-trace]
|
||||
- [--scratch-memory-trace]
|
||||
- [--pmc, SQ_WAVES]
|
||||
benchmark:
|
||||
# these names can be used with --filter-benchmark option
|
||||
- baseline
|
||||
- disabled-sdk-contexts
|
||||
- sdk-buffer-overhead
|
||||
- sdk-callback-overhead
|
||||
- tool-runtime-overhead
|
||||
group:
|
||||
- examples
|
||||
|
||||
jobs:
|
||||
# explicitly specifies name, command, rocprofv3, and benchmark. inherits default group(s)
|
||||
- name: hip-graph
|
||||
command: [hip-graph, 8, 500]
|
||||
rocprofv3:
|
||||
- [--kernel-trace]
|
||||
- [--hip-trace]
|
||||
benchmark:
|
||||
- baseline
|
||||
- disabled-sdk-contexts
|
||||
@@ -0,0 +1,3 @@
|
||||
# these are optional, not required
|
||||
mysql-connector-python
|
||||
pyelftools
|
||||
@@ -0,0 +1,7 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
add_subdirectory(lib)
|
||||
add_subdirectory(bin)
|
||||
add_subdirectory(share)
|
||||
@@ -0,0 +1,24 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
# Adding main rocprofv3
|
||||
configure_file(rocprofv3-benchmark.py
|
||||
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/rocprofv3-benchmark @ONLY)
|
||||
|
||||
install(
|
||||
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/rocprofv3-benchmark
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ
|
||||
WORLD_EXECUTE)
|
||||
|
||||
# downloaded timem
|
||||
install(
|
||||
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/timem
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ
|
||||
WORLD_EXECUTE
|
||||
OPTIONAL)
|
||||
|
||||
# Adding Benchmark Workloads
|
||||
add_subdirectory(mandelbrot)
|
||||
@@ -0,0 +1,44 @@
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(NOT CMAKE_HIP_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-sdk-benchmark-bin-mandelbrot LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_HIP_STANDARD 17)
|
||||
set(CMAKE_HIP_EXTENSIONS OFF)
|
||||
set(CMAKE_HIP_STANDARD_REQUIRED ON)
|
||||
|
||||
set_source_files_properties(mandelbrot.cpp PROPERTIES LANGUAGE HIP)
|
||||
set_source_files_properties(utils.cpp PROPERTIES LANGUAGE HIP)
|
||||
|
||||
add_executable(mandelbrot)
|
||||
target_sources(mandelbrot PRIVATE mandelbrot.cpp utils.cpp)
|
||||
target_compile_options(mandelbrot PRIVATE -W -Wall -Wextra -Wpedantic -Werror
|
||||
-ffp-contract=fast)
|
||||
target_include_directories(mandelbrot PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
install(
|
||||
TARGETS mandelbrot
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
COMPONENT benchmark)
|
||||
@@ -0,0 +1,938 @@
|
||||
/*
|
||||
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
#include <hip/hip_vector_types.h>
|
||||
#include <hip/math_functions.h>
|
||||
#include <omp.h>
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
typedef struct
|
||||
{
|
||||
double x;
|
||||
double y;
|
||||
double width;
|
||||
} coordRec;
|
||||
|
||||
coordRec coords[] = {
|
||||
{0.0, 0.0, 4.0}, // Whole set
|
||||
{0.0, 0.0, 0.00001}, // All black
|
||||
{-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail
|
||||
};
|
||||
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
float_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, uint maxIter)
|
||||
{
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float) (xPos + xStep * i);
|
||||
float y0 = (float) (yPos + yStep * j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
for(iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++)
|
||||
{
|
||||
tmp = x;
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
}
|
||||
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
float_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, uint maxIter)
|
||||
{
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float) (xPos + xStep * (float) i);
|
||||
float y0 = (float) (yPos + yStep * (float) j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
#define FAST
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
int stay;
|
||||
uint ccount = 0;
|
||||
stay = (x * x + y * y) <= 4.0;
|
||||
float savx = x;
|
||||
float savy = y;
|
||||
#ifdef FAST
|
||||
for(iter = 0; (iter < maxIter); iter += 16)
|
||||
{
|
||||
#else
|
||||
for(iter = 0; stay && (iter < maxIter); iter += 16)
|
||||
{
|
||||
#endif
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
stay = (x * x + y * y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay * 16;
|
||||
#ifdef FAST
|
||||
if(!stay) break;
|
||||
#endif
|
||||
}
|
||||
// Handle remainder
|
||||
if(!stay)
|
||||
{
|
||||
iter = 16;
|
||||
do
|
||||
{
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
} while(stay && iter);
|
||||
}
|
||||
|
||||
out[tid] = (uint) ccount;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
double_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, uint maxIter)
|
||||
{
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = (double) (xPos + xStep * i);
|
||||
double y0 = (double) (yPos + yStep * j);
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
double tmp;
|
||||
for(iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++)
|
||||
{
|
||||
tmp = x;
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
}
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
double_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, uint maxIter)
|
||||
{
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = (double) (xPos + xStep * (double) i);
|
||||
double y0 = (double) (yPos + yStep * (double) j);
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
|
||||
#define FAST
|
||||
uint iter = 0;
|
||||
double tmp;
|
||||
int stay;
|
||||
uint ccount = 0;
|
||||
stay = (x * x + y * y) <= 4.0;
|
||||
double savx = x;
|
||||
double savy = y;
|
||||
#ifdef FAST
|
||||
for(iter = 0; (iter < maxIter); iter += 16)
|
||||
#else
|
||||
for(iter = 0; stay && (iter < maxIter); iter += 16)
|
||||
#endif
|
||||
{
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
stay = (x * x + y * y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay * 16;
|
||||
#ifdef FAST
|
||||
if(!stay) break;
|
||||
#endif
|
||||
}
|
||||
// Handle remainder
|
||||
if(!stay)
|
||||
{
|
||||
iter = 16;
|
||||
do
|
||||
{
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
} while(stay && iter);
|
||||
}
|
||||
out[tid] = (uint) ccount;
|
||||
};
|
||||
|
||||
// Commenting it out as it is not used anywhere in the code
|
||||
// static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
|
||||
|
||||
// Expected results for each kernel run at each coord
|
||||
unsigned long long expectedIters[] = {
|
||||
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, 120254651ull,
|
||||
203277748ull, 2147483648ull, 120254651ull, 203315114ull, 2147483648ull, 120042599ull,
|
||||
203315114ull, 2147483648ull, 120042599ull, 203280620ull, 2147483648ull, 120485704ull,
|
||||
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, 120485704ull,
|
||||
203315114ull, 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
|
||||
|
||||
class hipPerfMandelBrot
|
||||
{
|
||||
public:
|
||||
hipPerfMandelBrot();
|
||||
~hipPerfMandelBrot();
|
||||
|
||||
void setNumKernels(unsigned int num) { numKernels = num; }
|
||||
|
||||
unsigned int getNumKernels() const { return numKernels; }
|
||||
|
||||
void setNumStreams(unsigned int num) { numStreams = num; }
|
||||
unsigned int getNumStreams() const { return numStreams; }
|
||||
|
||||
void open(int deviceID);
|
||||
void run(unsigned int testCase, unsigned int deviceId);
|
||||
void printResults(void);
|
||||
|
||||
// array of funtion pointers
|
||||
typedef void (hipPerfMandelBrot::*funPtr)(uint* out,
|
||||
uint width,
|
||||
float xPos,
|
||||
float yPos,
|
||||
float xStep,
|
||||
float yStep,
|
||||
uint maxIter,
|
||||
hipStream_t* streams,
|
||||
int blocks,
|
||||
int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
// Wrappers
|
||||
void float_mad(uint* out,
|
||||
uint width,
|
||||
float xPos,
|
||||
float yPos,
|
||||
float xStep,
|
||||
float yStep,
|
||||
uint maxIter,
|
||||
hipStream_t* streams,
|
||||
int blocks,
|
||||
int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
void float_mandel_unroll(uint* out,
|
||||
uint width,
|
||||
float xPos,
|
||||
float yPos,
|
||||
float xStep,
|
||||
float yStep,
|
||||
uint maxIter,
|
||||
hipStream_t* streams,
|
||||
int blocks,
|
||||
int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
void double_mad(uint* out,
|
||||
uint width,
|
||||
float xPos,
|
||||
float yPos,
|
||||
float xStep,
|
||||
float yStep,
|
||||
uint maxIter,
|
||||
hipStream_t* streams,
|
||||
int blocks,
|
||||
int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
void double_mandel_unroll(uint* out,
|
||||
uint width,
|
||||
float xPos,
|
||||
float yPos,
|
||||
float xStep,
|
||||
float yStep,
|
||||
uint maxIter,
|
||||
hipStream_t* streams,
|
||||
int blocks,
|
||||
int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
hipStream_t streams[2];
|
||||
|
||||
private:
|
||||
void setData(void* ptr, unsigned int value);
|
||||
void checkData(uint* ptr);
|
||||
|
||||
unsigned int numKernels;
|
||||
unsigned int numStreams;
|
||||
|
||||
std::map<std::string, std::vector<double>> results;
|
||||
unsigned int width_;
|
||||
unsigned int bufSize;
|
||||
unsigned int maxIter;
|
||||
unsigned int coordIdx;
|
||||
volatile unsigned long long totalIters = 0;
|
||||
int numCUs;
|
||||
static const unsigned int numLoops = 10;
|
||||
};
|
||||
|
||||
hipPerfMandelBrot::hipPerfMandelBrot() = default;
|
||||
|
||||
hipPerfMandelBrot::~hipPerfMandelBrot() = default;
|
||||
|
||||
void
|
||||
hipPerfMandelBrot::open(int deviceId)
|
||||
{
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
if(nGpu < 1)
|
||||
{
|
||||
failed("No GPU!");
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
hipDeviceProp_t props = {};
|
||||
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::cout << "info: running on bus "
|
||||
<< "0x" << props.pciBusID << " " << props.name << " with "
|
||||
<< props.multiProcessorCount << " CUs"
|
||||
<< " and device id: " << deviceId << std::endl;
|
||||
|
||||
numCUs = props.multiProcessorCount;
|
||||
}
|
||||
|
||||
void
|
||||
hipPerfMandelBrot::printResults()
|
||||
{
|
||||
// int numkernels = getNumKernels();
|
||||
int numStreams = getNumStreams();
|
||||
|
||||
std::cout << "\n"
|
||||
<< "Measured perf for kernels in GFLOPS on " << numStreams << " streams (s)"
|
||||
<< std::endl;
|
||||
|
||||
std::map<std::string, std::vector<double>>::iterator itr;
|
||||
for(itr = results.begin(); itr != results.end(); itr++)
|
||||
{
|
||||
std::cout << "\n" << std::setw(20) << itr->first << " ";
|
||||
for(auto i : results[itr->first])
|
||||
{
|
||||
std::cout << std::setw(10) << i << " ";
|
||||
}
|
||||
}
|
||||
results.clear();
|
||||
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Wrappers for the kernel launches
|
||||
void
|
||||
hipPerfMandelBrot::float_mad(uint* out,
|
||||
uint /* width */,
|
||||
float xPos,
|
||||
float yPos,
|
||||
float xStep,
|
||||
float yStep,
|
||||
uint maxIter,
|
||||
hipStream_t* streams,
|
||||
int blocks,
|
||||
int threads_per_block,
|
||||
int kernelCnt)
|
||||
{
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mad_kernel<float>,
|
||||
dim3(blocks),
|
||||
dim3(threads_per_block),
|
||||
0,
|
||||
streams[kernelCnt % streamCnt],
|
||||
out,
|
||||
width_,
|
||||
xPos,
|
||||
yPos,
|
||||
xStep,
|
||||
yStep,
|
||||
maxIter);
|
||||
}
|
||||
|
||||
void
|
||||
hipPerfMandelBrot::float_mandel_unroll(uint* out,
|
||||
uint /* width */,
|
||||
float xPos,
|
||||
float yPos,
|
||||
float xStep,
|
||||
float yStep,
|
||||
uint maxIter,
|
||||
hipStream_t* streams,
|
||||
int blocks,
|
||||
int threads_per_block,
|
||||
int kernelCnt)
|
||||
{
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mandel_unroll_kernel<float>,
|
||||
dim3(blocks),
|
||||
dim3(threads_per_block),
|
||||
0,
|
||||
streams[kernelCnt % streamCnt],
|
||||
out,
|
||||
width_,
|
||||
xPos,
|
||||
yPos,
|
||||
xStep,
|
||||
yStep,
|
||||
maxIter);
|
||||
}
|
||||
|
||||
void
|
||||
hipPerfMandelBrot::double_mad(uint* out,
|
||||
uint /* width */,
|
||||
float xPos,
|
||||
float yPos,
|
||||
float xStep,
|
||||
float yStep,
|
||||
uint maxIter,
|
||||
hipStream_t* streams,
|
||||
int blocks,
|
||||
int threads_per_block,
|
||||
int kernelCnt)
|
||||
{
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(double_mad_kernel<double>,
|
||||
dim3(blocks),
|
||||
dim3(threads_per_block),
|
||||
0,
|
||||
streams[kernelCnt % streamCnt],
|
||||
out,
|
||||
width_,
|
||||
xPos,
|
||||
yPos,
|
||||
xStep,
|
||||
yStep,
|
||||
maxIter);
|
||||
}
|
||||
|
||||
void
|
||||
hipPerfMandelBrot::double_mandel_unroll(uint* out,
|
||||
uint /* width */,
|
||||
float xPos,
|
||||
float yPos,
|
||||
float xStep,
|
||||
float yStep,
|
||||
uint maxIter,
|
||||
hipStream_t* streams,
|
||||
int blocks,
|
||||
int threads_per_block,
|
||||
int kernelCnt)
|
||||
{
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mandel_unroll_kernel<double>,
|
||||
dim3(blocks),
|
||||
dim3(threads_per_block),
|
||||
0,
|
||||
streams[kernelCnt % streamCnt],
|
||||
out,
|
||||
width_,
|
||||
xPos,
|
||||
yPos,
|
||||
xStep,
|
||||
yStep,
|
||||
maxIter);
|
||||
}
|
||||
|
||||
void
|
||||
hipPerfMandelBrot::run(unsigned int testCase, unsigned int /* deviceId */)
|
||||
{
|
||||
unsigned int numStreams = getNumStreams();
|
||||
coordIdx = testCase % numCoords;
|
||||
|
||||
funPtr p[] = {&hipPerfMandelBrot::float_mad,
|
||||
&hipPerfMandelBrot::float_mandel_unroll,
|
||||
&hipPerfMandelBrot::double_mad,
|
||||
&hipPerfMandelBrot::double_mandel_unroll};
|
||||
|
||||
// Maximum iteration count
|
||||
maxIter = 32768;
|
||||
|
||||
// Variable-length arrays (VLAs) are not part of the C++ standard, using std::vector instead
|
||||
// uint * hPtr[numKernels];
|
||||
// uint * dPtr[numKernels];
|
||||
std::vector<uint*> hPtr(numKernels);
|
||||
std::vector<uint*> dPtr(numKernels);
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
|
||||
width_ = 256;
|
||||
|
||||
bufSize = width_ * width_ * sizeof(uint);
|
||||
|
||||
// Create streams for concurrency
|
||||
for(uint i = 0; i < numStreams; i++)
|
||||
{
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
// Allocate memory on the host and device
|
||||
for(uint i = 0; i < numKernels; i++)
|
||||
{
|
||||
HIPCHECK(hipHostMalloc((void**) &hPtr[i], bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIPCHECK(hipMalloc((uint**) &dPtr[i], bufSize))
|
||||
}
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
int threads = (bufSize / sizeof(uint));
|
||||
int threads_per_block = 64;
|
||||
int blocks = (threads / threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
// float xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
// float yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
// float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
// float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Copy memory asynchronously and concurrently from host to device
|
||||
for(uint i = 0; i < numKernels; i++)
|
||||
{
|
||||
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
// Synchronize to make sure all the copies are completed
|
||||
HIPCHECK(hipStreamSynchronize(nullptr));
|
||||
|
||||
int kernelIdx;
|
||||
if(testCase == 0 || testCase == 5 || testCase == 10)
|
||||
{
|
||||
kernelIdx = 0;
|
||||
}
|
||||
|
||||
else if(testCase == 1 || testCase == 6 || testCase == 11)
|
||||
{
|
||||
kernelIdx = 1;
|
||||
}
|
||||
else if(testCase == 2 || testCase == 7 || testCase == 12)
|
||||
{
|
||||
kernelIdx = 2;
|
||||
}
|
||||
else if(testCase == 3 || testCase == 8 || testCase == 13)
|
||||
{
|
||||
kernelIdx = 3;
|
||||
}
|
||||
|
||||
double totalTime = 0.0;
|
||||
|
||||
for(unsigned int k = 0; k < numLoops; k++)
|
||||
{
|
||||
if((testCase == 0 || testCase == 1 || testCase == 2 || testCase == 5 || testCase == 6 ||
|
||||
testCase == 7 || testCase == 10 || testCase == 11 || testCase == 12))
|
||||
{
|
||||
float xStep = (float) (coords[coordIdx].width / (double) width_);
|
||||
float yStep = (float) (-coords[coordIdx].width / (double) width_);
|
||||
float xPos = (float) (coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = (float) (coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for(uint i = 0; i < numKernels; i++)
|
||||
{
|
||||
(this->*p[kernelIdx])(dPtr[i],
|
||||
width_,
|
||||
xPos,
|
||||
yPos,
|
||||
xStep,
|
||||
yStep,
|
||||
maxIter,
|
||||
streams,
|
||||
blocks,
|
||||
threads_per_block,
|
||||
i);
|
||||
}
|
||||
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
double xStep = coords[coordIdx].width / (double) width_;
|
||||
double yStep = -coords[coordIdx].width / (double) width_;
|
||||
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
|
||||
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for(uint i = 0; i < numKernels; i++)
|
||||
{
|
||||
(this->*p[kernelIdx])(dPtr[i],
|
||||
width_,
|
||||
xPos,
|
||||
yPos,
|
||||
xStep,
|
||||
yStep,
|
||||
maxIter,
|
||||
streams,
|
||||
blocks,
|
||||
threads_per_block,
|
||||
i);
|
||||
}
|
||||
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
}
|
||||
}
|
||||
|
||||
// Copy data back from device to the host
|
||||
for(uint i = 0; i < numKernels; i++)
|
||||
{
|
||||
HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
for(uint i = 0; i < numKernels; i++)
|
||||
{
|
||||
checkData(hPtr[i]);
|
||||
|
||||
int j = 0;
|
||||
while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30)
|
||||
{
|
||||
j++;
|
||||
}
|
||||
|
||||
if(j == 30)
|
||||
{
|
||||
std::cout << "Incorrect iteration count detected. ";
|
||||
}
|
||||
}
|
||||
|
||||
// Compute GFLOPS. There are 7 FLOPs per iteration
|
||||
double perf = ((double) (totalIters * numKernels) * 7 * (double) (1e-09)) /
|
||||
(totalTime / (double) numLoops);
|
||||
|
||||
std::vector<std::string> kernelName = {"float", "float_unroll", "double", "double_unroll"};
|
||||
|
||||
// Print results except for Warm-up kernel
|
||||
if(testCase != 100)
|
||||
{
|
||||
results[kernelName[testCase % 4]].push_back(perf);
|
||||
}
|
||||
|
||||
for(uint i = 0; i < numStreams; i++)
|
||||
{
|
||||
HIPCHECK(hipStreamDestroy(streams[i]));
|
||||
}
|
||||
|
||||
// Free host and device memory
|
||||
for(uint i = 0; i < numKernels; i++)
|
||||
{
|
||||
HIPCHECK(hipHostFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
hipPerfMandelBrot::setData(void* ptr, unsigned int value)
|
||||
{
|
||||
unsigned int* ptr2 = (unsigned int*) ptr;
|
||||
for(unsigned int i = 0; i < width_ * width_; i++)
|
||||
{
|
||||
ptr2[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
hipPerfMandelBrot::checkData(uint* ptr)
|
||||
{
|
||||
totalIters = 0;
|
||||
for(unsigned int i = 0; i < width_ * width_; i++)
|
||||
{
|
||||
totalIters += ptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char* argv[])
|
||||
{
|
||||
// Default values for kernels and streams
|
||||
unsigned int numStreamsWarmup = 1, numKernelsWarmup = 1;
|
||||
unsigned int numStreamsSync = 1, numKernelsSync = 1;
|
||||
unsigned int numStreamsAsync = 2, numKernelsAsync = 2;
|
||||
|
||||
// Check for help arguments
|
||||
if(argc > 1 && (std::string(argv[1]) == "-h" || std::string(argv[1]) == "--help" ||
|
||||
std::string(argv[1]) == "help"))
|
||||
{
|
||||
std::cout << "Usage: " << argv[0]
|
||||
<< " [--warmup <numStreams>:<numKernels>] [--sync <numStreams>:<numKernels>] "
|
||||
"[--async <numStreams>:<numKernels>]"
|
||||
<< std::endl;
|
||||
std::cout << "Example: " << argv[0] << " --warmup 1:1 --sync 2:4 --async 3:6" << std::endl;
|
||||
std::exit(EXIT_SUCCESS); // Exit with success status
|
||||
}
|
||||
|
||||
// Parse command-line arguments
|
||||
for(int i = 1; i < argc; i++)
|
||||
{
|
||||
std::string arg = argv[i];
|
||||
if(arg == "--warmup" && i + 1 < argc)
|
||||
{
|
||||
std::string value = argv[++i];
|
||||
std::stringstream ss(value);
|
||||
char delimiter;
|
||||
ss >> numStreamsWarmup >> delimiter >> numKernelsWarmup;
|
||||
if(delimiter != ':' || ss.fail())
|
||||
{
|
||||
std::cerr << "Invalid format for --warmup. Expected <numStreams>:<numKernels>."
|
||||
<< std::endl;
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
else if(arg == "--sync" && i + 1 < argc)
|
||||
{
|
||||
std::string value = argv[++i];
|
||||
std::stringstream ss(value);
|
||||
char delimiter;
|
||||
ss >> numStreamsSync >> delimiter >> numKernelsSync;
|
||||
if(delimiter != ':' || ss.fail())
|
||||
{
|
||||
std::cerr << "Invalid format for --sync. Expected <numStreams>:<numKernels>."
|
||||
<< std::endl;
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
else if(arg == "--async" && i + 1 < argc)
|
||||
{
|
||||
std::string value = argv[++i];
|
||||
std::stringstream ss(value);
|
||||
char delimiter;
|
||||
ss >> numStreamsAsync >> delimiter >> numKernelsAsync;
|
||||
if(delimiter != ':' || ss.fail())
|
||||
{
|
||||
std::cerr << "Invalid format for --async. Expected <numStreams>:<numKernels>."
|
||||
<< std::endl;
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << "Unknown argument: " << arg << std::endl;
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
int deviceCount = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&deviceCount)); // Get the number of devices
|
||||
|
||||
#pragma omp parallel for
|
||||
for(int deviceId = 0; deviceId < deviceCount; deviceId++)
|
||||
{
|
||||
hipPerfMandelBrot mandelbrotCompute;
|
||||
mandelbrotCompute.open(deviceId);
|
||||
|
||||
for(unsigned int testCase = 0; testCase < 3; testCase++)
|
||||
{
|
||||
switch(testCase)
|
||||
{
|
||||
case 0:
|
||||
{
|
||||
// Warmup-kernel - default stream executes serially
|
||||
mandelbrotCompute.setNumStreams(numStreamsWarmup);
|
||||
mandelbrotCompute.setNumKernels(numKernelsWarmup);
|
||||
mandelbrotCompute.run(100 /*Random number*/, deviceId);
|
||||
break;
|
||||
}
|
||||
|
||||
case 1:
|
||||
{
|
||||
// run all - sync
|
||||
int i = 0;
|
||||
do
|
||||
{
|
||||
mandelbrotCompute.setNumStreams(numStreamsSync);
|
||||
mandelbrotCompute.setNumKernels(numKernelsSync);
|
||||
mandelbrotCompute.run(i, deviceId);
|
||||
i++;
|
||||
} while(i < 12);
|
||||
mandelbrotCompute.printResults();
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
case 2:
|
||||
{
|
||||
// run all - async
|
||||
int i = 0;
|
||||
do
|
||||
{
|
||||
mandelbrotCompute.setNumStreams(numStreamsAsync);
|
||||
mandelbrotCompute.setNumKernels(numKernelsAsync);
|
||||
mandelbrotCompute.run(i, deviceId);
|
||||
i++;
|
||||
} while(i < 12);
|
||||
mandelbrotCompute.printResults();
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -0,0 +1,324 @@
|
||||
/*
|
||||
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
#include <thread>
|
||||
#ifdef __linux__
|
||||
# include <sys/sysinfo.h>
|
||||
#elif defined(_WIN32)
|
||||
# include <windows.h>
|
||||
#endif
|
||||
|
||||
// standard global variables that can be set on command line
|
||||
size_t N = 4 * 1024 * 1024;
|
||||
char memsetval = 0x42;
|
||||
int memsetD32val = 0xDEADBEEF;
|
||||
short memsetD16val = 0xDEAD;
|
||||
char memsetD8val = 0xDE;
|
||||
int iterations = 1;
|
||||
unsigned blocksPerCU = 6; // to hide latency
|
||||
unsigned threadsPerBlock = 256;
|
||||
int textureFilterMode = 0; // 0: hipFilterModePoint; 1: hipFilterModeLinear
|
||||
int p_gpuDevice = 0;
|
||||
unsigned p_verbose = 0;
|
||||
int p_tests = -1; /*which tests to run. Interpretation is left to each test. default:all*/
|
||||
int debug_test = 0;
|
||||
#ifdef _WIN64
|
||||
const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES=";
|
||||
const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES=";
|
||||
const char* PATH_SEPERATOR_STR = "\\";
|
||||
const char* NULL_DEVICE = "NUL:";
|
||||
#else
|
||||
const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES";
|
||||
const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES";
|
||||
const char* PATH_SEPERATOR_STR = "/";
|
||||
const char* NULL_DEVICE = "/dev/null";
|
||||
#endif
|
||||
|
||||
#ifdef _WIN64
|
||||
// Windows does not have rand_r, use srand and rand instead.
|
||||
int
|
||||
rand_r(unsigned int* s)
|
||||
{
|
||||
srand(*s);
|
||||
return rand();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Get Free Memory from the system
|
||||
static size_t
|
||||
getMemoryAmount()
|
||||
{
|
||||
#if __linux__
|
||||
struct sysinfo info;
|
||||
int _ = sysinfo(&info);
|
||||
return info.freeram / (1024 * 1024); // MB
|
||||
#elif defined(_WIN32)
|
||||
MEMORYSTATUSEX statex;
|
||||
statex.dwLength = sizeof(statex);
|
||||
GlobalMemoryStatusEx(&statex);
|
||||
return (statex.ullAvailPhys / (1024 * 1024)); // MB
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t
|
||||
getHostThreadCount(const size_t memPerThread, const size_t maxThreads)
|
||||
{
|
||||
if(memPerThread == 0) return 0;
|
||||
auto memAmount = getMemoryAmount();
|
||||
const auto processor_count = std::thread::hardware_concurrency();
|
||||
if(processor_count == 0 || memAmount == 0) return 0;
|
||||
size_t thread_count = 0;
|
||||
if((processor_count * memPerThread) < memAmount)
|
||||
thread_count = processor_count;
|
||||
else
|
||||
thread_count = reinterpret_cast<size_t>(memAmount / memPerThread);
|
||||
if(maxThreads > 0)
|
||||
{
|
||||
return (thread_count > maxThreads) ? maxThreads : thread_count;
|
||||
}
|
||||
return thread_count;
|
||||
}
|
||||
|
||||
// Function to determine if the device is of gfx11 architecture
|
||||
bool
|
||||
IsGfx11()
|
||||
{
|
||||
#if defined(__HIP_PLATFORM_NVIDIA__)
|
||||
return false;
|
||||
#elif defined(__HIP_PLATFORM_AMD__)
|
||||
int device = -1;
|
||||
hipDeviceProp_t props{};
|
||||
HIPCHECK(hipGetDevice(&device));
|
||||
HIPCHECK(hipGetDeviceProperties(&props, device));
|
||||
|
||||
// Get GCN Arch Name and compare to check if it is gfx11
|
||||
std::string arch = std::string(props.gcnArchName);
|
||||
auto pos = arch.find(":");
|
||||
if(pos != std::string::npos) arch = arch.substr(0, pos);
|
||||
|
||||
if(arch.size() >= 5) arch = arch.substr(0, 5);
|
||||
|
||||
return (arch == std::string("gfx11")) ? true : false;
|
||||
#else
|
||||
std::cout << "Have to be either Nvidia or AMD platform, asserting" << std::endl;
|
||||
assert(false);
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace HipTest
|
||||
{
|
||||
double
|
||||
elapsed_time(long long startTimeUs, long long stopTimeUs)
|
||||
{
|
||||
return ((double) (stopTimeUs - startTimeUs)) / ((double) (1000));
|
||||
}
|
||||
|
||||
int
|
||||
parseSize(const char* str, size_t* output)
|
||||
{
|
||||
char* next;
|
||||
*output = strtoull(str, &next, 0);
|
||||
int l = strlen(str);
|
||||
if(l)
|
||||
{
|
||||
char c = str[l - 1]; // last char.
|
||||
if((c == 'k') || (c == 'K'))
|
||||
{
|
||||
*output *= 1024;
|
||||
}
|
||||
if((c == 'm') || (c == 'M'))
|
||||
{
|
||||
*output *= (1024 * 1024);
|
||||
}
|
||||
if((c == 'g') || (c == 'G'))
|
||||
{
|
||||
*output *= (1024 * 1024 * 1024);
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
parseUInt(const char* str, unsigned int* output)
|
||||
{
|
||||
char* next;
|
||||
*output = strtoul(str, &next, 0);
|
||||
return !strlen(next);
|
||||
}
|
||||
|
||||
int
|
||||
parseInt(const char* str, int* output)
|
||||
{
|
||||
char* next;
|
||||
*output = strtol(str, &next, 0);
|
||||
return !strlen(next);
|
||||
}
|
||||
|
||||
int
|
||||
parseStandardArguments(int argc, char* argv[], bool failOnUndefinedArg)
|
||||
{
|
||||
int extraArgs = 1;
|
||||
for(int i = 1; i < argc; i++)
|
||||
{
|
||||
const char* arg = argv[i];
|
||||
|
||||
if(!strcmp(arg, " "))
|
||||
{
|
||||
// skip NULL args.
|
||||
}
|
||||
else if(!strcmp(arg, "--N") || (!strcmp(arg, "-N")))
|
||||
{
|
||||
if(++i >= argc || !HipTest::parseSize(argv[i], &N))
|
||||
{
|
||||
failed("Bad N size argument");
|
||||
}
|
||||
}
|
||||
else if(!strcmp(arg, "--threadsPerBlock"))
|
||||
{
|
||||
if(++i >= argc || !HipTest::parseUInt(argv[i], &threadsPerBlock))
|
||||
{
|
||||
failed("Bad threadsPerBlock argument");
|
||||
}
|
||||
}
|
||||
else if(!strcmp(arg, "--blocksPerCU"))
|
||||
{
|
||||
if(++i >= argc || !HipTest::parseUInt(argv[i], &blocksPerCU))
|
||||
{
|
||||
failed("Bad blocksPerCU argument");
|
||||
}
|
||||
}
|
||||
else if(!strcmp(arg, "--memsetval"))
|
||||
{
|
||||
int ex;
|
||||
if(++i >= argc || !HipTest::parseInt(argv[i], &ex))
|
||||
{
|
||||
failed("Bad memsetval argument");
|
||||
}
|
||||
memsetval = ex;
|
||||
}
|
||||
else if(!strcmp(arg, "--memsetD32val"))
|
||||
{
|
||||
int ex;
|
||||
if(++i >= argc || !HipTest::parseInt(argv[i], &ex))
|
||||
{
|
||||
failed("Bad memsetD32val argument");
|
||||
}
|
||||
memsetD32val = ex;
|
||||
}
|
||||
else if(!strcmp(arg, "--memsetD16val"))
|
||||
{
|
||||
int ex;
|
||||
if(++i >= argc || !HipTest::parseInt(argv[i], &ex))
|
||||
{
|
||||
failed("Bad memsetD16val argument");
|
||||
}
|
||||
memsetD16val = ex;
|
||||
}
|
||||
else if(!strcmp(arg, "--memsetD8val"))
|
||||
{
|
||||
int ex;
|
||||
if(++i >= argc || !HipTest::parseInt(argv[i], &ex))
|
||||
{
|
||||
failed("Bad memsetD8val argument");
|
||||
}
|
||||
memsetD8val = ex;
|
||||
}
|
||||
else if(!strcmp(arg, "--textureFilterMode"))
|
||||
{
|
||||
int mode;
|
||||
if(++i >= argc || !HipTest::parseInt(argv[i], &mode))
|
||||
{
|
||||
failed("Bad textureFilterMode argument");
|
||||
}
|
||||
textureFilterMode = mode;
|
||||
}
|
||||
else if(!strcmp(arg, "--iterations") || (!strcmp(arg, "-i")))
|
||||
{
|
||||
if(++i >= argc || !HipTest::parseInt(argv[i], &iterations))
|
||||
{
|
||||
failed("Bad iterations argument");
|
||||
}
|
||||
}
|
||||
else if(!strcmp(arg, "--gpu") || (!strcmp(arg, "-gpuDevice")) || (!strcmp(arg, "-g")))
|
||||
{
|
||||
if(++i >= argc || !HipTest::parseInt(argv[i], &p_gpuDevice))
|
||||
{
|
||||
failed("Bad gpuDevice argument");
|
||||
}
|
||||
}
|
||||
else if(!strcmp(arg, "--verbose") || (!strcmp(arg, "-v")))
|
||||
{
|
||||
if(++i >= argc || !HipTest::parseUInt(argv[i], &p_verbose))
|
||||
{
|
||||
failed("Bad verbose argument");
|
||||
}
|
||||
}
|
||||
else if(!strcmp(arg, "--tests") || (!strcmp(arg, "-t")))
|
||||
{
|
||||
if(++i >= argc || !HipTest::parseInt(argv[i], &p_tests))
|
||||
{
|
||||
failed("Bad tests argument");
|
||||
}
|
||||
}
|
||||
else if(!strcmp(arg, "--debug") || (!strcmp(arg, "-d")))
|
||||
{
|
||||
if(++i >= argc || !HipTest::parseInt(argv[i], &debug_test))
|
||||
{
|
||||
failed("Bad tests argument");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(failOnUndefinedArg)
|
||||
{
|
||||
failed("Bad argument '%s'", arg);
|
||||
}
|
||||
else
|
||||
{
|
||||
argv[extraArgs++] = argv[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
return extraArgs;
|
||||
}
|
||||
|
||||
unsigned
|
||||
setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N)
|
||||
{
|
||||
int device;
|
||||
HIPCHECK(hipGetDevice(&device));
|
||||
hipDeviceProp_t props;
|
||||
HIPCHECK(hipGetDeviceProperties(&props, device));
|
||||
|
||||
unsigned blocks = props.multiProcessorCount * blocksPerCU;
|
||||
if(blocks * threadsPerBlock > N)
|
||||
{
|
||||
blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
|
||||
}
|
||||
|
||||
return blocks;
|
||||
}
|
||||
|
||||
} // namespace HipTest
|
||||
@@ -0,0 +1,733 @@
|
||||
/*
|
||||
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* File is intended to C and CPP compliant hence any CPP specic changes
|
||||
* should be added into CPP section
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
|
||||
# include <iomanip>
|
||||
# include <iostream>
|
||||
# if __CUDACC__
|
||||
# include <sys/time.h>
|
||||
# else
|
||||
# include <chrono>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// ************************ GCC section **************************
|
||||
#include <stddef.h>
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_runtime_api.h"
|
||||
|
||||
#define HC __attribute__((hc))
|
||||
|
||||
#define KNRM "\x1B[0m"
|
||||
#define KRED "\x1B[31m"
|
||||
#define KGRN "\x1B[32m"
|
||||
#define KYEL "\x1B[33m"
|
||||
#define KBLU "\x1B[34m"
|
||||
#define KMAG "\x1B[35m"
|
||||
#define KCYN "\x1B[36m"
|
||||
#define KWHT "\x1B[37m"
|
||||
|
||||
// HIP Skip Return code set at cmake
|
||||
#define HIP_SKIP_RETURN_CODE 127
|
||||
#define HIP_ENABLE_SKIP_TESTS 0
|
||||
|
||||
// Recommended thresholds for Tests
|
||||
#define MAX_THREADS 100
|
||||
|
||||
inline bool
|
||||
hip_skip_tests_enabled()
|
||||
{
|
||||
return HIP_ENABLE_SKIP_TESTS;
|
||||
}
|
||||
|
||||
inline int
|
||||
hip_skip_retcode()
|
||||
{
|
||||
// HIP Skip Return code set at cmake
|
||||
return HIP_SKIP_RETURN_CODE;
|
||||
}
|
||||
|
||||
// This must be called in the end of main() to indicate test passed with success.
|
||||
// If it's called somewhere else, compiling issues or unexpected result will arise.
|
||||
#define passed() \
|
||||
printf("%sPASSED!%s\n", KGRN, KNRM); \
|
||||
return 0;
|
||||
|
||||
// The real "assert" would have written to stderr. But it is
|
||||
// sufficient to just fflush here without getting pedantic. This also
|
||||
// ensures that we don't lose any earlier writes to stdout.
|
||||
#define failed(...) \
|
||||
printf("%serror: ", KRED); \
|
||||
printf(__VA_ARGS__); \
|
||||
printf("\n"); \
|
||||
printf("error: TEST FAILED\n%s", KNRM); \
|
||||
fflush(NULL); \
|
||||
abort();
|
||||
|
||||
#define warn(...) \
|
||||
printf("%swarn: ", KYEL); \
|
||||
printf(__VA_ARGS__); \
|
||||
printf("\n"); \
|
||||
printf("warn: TEST WARNING\n%s", KNRM);
|
||||
|
||||
#define HIP_PRINT_STATUS(status) \
|
||||
std::cout << hipGetErrorName(status) << " at line: " << __LINE__ << std::endl;
|
||||
|
||||
#define HIPCHECK(error) \
|
||||
{ \
|
||||
hipError_t localError = error; \
|
||||
if((localError != hipSuccess) && (localError != hipErrorPeerAccessAlreadyEnabled)) \
|
||||
{ \
|
||||
printf("%serror: '%s'(%d) from %s at %s:%d%s\n", \
|
||||
KRED, \
|
||||
hipGetErrorString(localError), \
|
||||
localError, \
|
||||
#error, \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
KNRM); \
|
||||
failed("API returned error code."); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define HIPASSERT(condition) \
|
||||
if(!(condition)) \
|
||||
{ \
|
||||
failed("%sassertion %s at %s:%d%s \n", KRED, #condition, __FILE__, __LINE__, KNRM); \
|
||||
}
|
||||
|
||||
#define HIPCHECK_API(API_CALL, EXPECTED_ERROR) \
|
||||
{ \
|
||||
hipError_t _e = (API_CALL); \
|
||||
if(_e != (EXPECTED_ERROR)) \
|
||||
{ \
|
||||
failed("%sAPI '%s' returned %d(%s) but test expected %d(%s) at %s:%d%s \n", \
|
||||
KRED, \
|
||||
#API_CALL, \
|
||||
_e, \
|
||||
hipGetErrorName(_e), \
|
||||
EXPECTED_ERROR, \
|
||||
hipGetErrorName(EXPECTED_ERROR), \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
KNRM); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define HIPCHECK_RETURN_ONFAIL(func) \
|
||||
do \
|
||||
{ \
|
||||
hipError_t herror = (func); \
|
||||
if(herror != hipSuccess) \
|
||||
{ \
|
||||
return herror; \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
#ifdef _WIN64
|
||||
# include <tchar.h>
|
||||
# define aligned_alloc(x, y) _aligned_malloc(y, x)
|
||||
# define aligned_free(x) _aligned_free(x)
|
||||
# define popen(x, y) _popen(x, y)
|
||||
# define pclose(x) _pclose(x)
|
||||
# define setenv(x, y, z) _putenv_s(x, y)
|
||||
# define unsetenv _putenv
|
||||
# define fileno(x) _fileno(x)
|
||||
# define dup(x) _dup(x)
|
||||
# define dup2(x, y) _dup2(x, y)
|
||||
# define pipe(x, y, z) _pipe(x, y, z)
|
||||
# define sleep(x) _sleep(x)
|
||||
#else
|
||||
# define aligned_free(x) free(x)
|
||||
#endif
|
||||
|
||||
// standard command-line variables:
|
||||
extern size_t N;
|
||||
extern char memsetval;
|
||||
extern int memsetD32val;
|
||||
extern short memsetD16val;
|
||||
extern char memsetD8val;
|
||||
extern int iterations;
|
||||
extern unsigned blocksPerCU;
|
||||
extern unsigned threadsPerBlock;
|
||||
extern int textureFilterMode;
|
||||
extern int p_gpuDevice;
|
||||
extern unsigned p_verbose;
|
||||
extern int p_tests;
|
||||
extern int debug_test;
|
||||
extern const char* HIP_VISIBLE_DEVICES_STR;
|
||||
extern const char* CUDA_VISIBLE_DEVICES_STR;
|
||||
extern const char* PATH_SEPERATOR_STR;
|
||||
extern const char* NULL_DEVICE;
|
||||
|
||||
// ********************* CPP section *********************
|
||||
#ifdef __cplusplus
|
||||
|
||||
# ifdef __HIP_PLATFORM_HCC
|
||||
# define TYPENAME(T) typeid(T).name()
|
||||
# else
|
||||
# define TYPENAME(T) "?"
|
||||
# endif
|
||||
|
||||
# ifdef _WIN64
|
||||
int
|
||||
rand_r(unsigned int* s);
|
||||
# endif
|
||||
|
||||
// Get Optimal Thread count size
|
||||
size_t
|
||||
getHostThreadCount(const size_t memPerThread = 200 /* MB */, const size_t maxThreads = 0);
|
||||
|
||||
namespace HipTest
|
||||
{
|
||||
// Returns the current system time in microseconds
|
||||
inline long long
|
||||
get_time()
|
||||
{
|
||||
# if __CUDACC__
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, 0);
|
||||
return (tv.tv_sec * 1000000) + tv.tv_usec;
|
||||
# else
|
||||
return std::chrono::high_resolution_clock::now().time_since_epoch() /
|
||||
std::chrono::microseconds(1);
|
||||
# endif
|
||||
}
|
||||
|
||||
double
|
||||
elapsed_time(long long startTimeUs, long long stopTimeUs);
|
||||
|
||||
int
|
||||
parseSize(const char* str, size_t* output);
|
||||
int
|
||||
parseUInt(const char* str, unsigned int* output);
|
||||
int
|
||||
parseInt(const char* str, int* output);
|
||||
int
|
||||
parseStandardArguments(int argc, char* argv[], bool failOnUndefinedArg);
|
||||
|
||||
unsigned
|
||||
setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N);
|
||||
|
||||
template <typename T> // pointer type
|
||||
void
|
||||
checkArray(T hData, T hOutputData, size_t width, size_t height, size_t depth)
|
||||
{
|
||||
for([[maybe_unused]] size_t i = 0; i < depth; i++)
|
||||
{
|
||||
for([[maybe_unused]] size_t j = 0; j < height; j++)
|
||||
{
|
||||
for([[maybe_unused]] size_t k = 0; k < width; k++)
|
||||
{
|
||||
int offset = i * width * height + j * width + k;
|
||||
if(hData[offset] != hOutputData[offset])
|
||||
{
|
||||
std::cerr << '[' << i << ',' << j << ',' << k << "]:" << hData[offset] << "----"
|
||||
<< hOutputData[offset] << " ";
|
||||
failed("mistmatch at:%d %d %d", i, j, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
checkArray(T input, T output, size_t height, size_t width)
|
||||
{
|
||||
for(size_t i = 0; i < height; i++)
|
||||
{
|
||||
for(size_t j = 0; j < width; j++)
|
||||
{
|
||||
int offset = i * width + j;
|
||||
if(input[offset] != output[offset])
|
||||
{
|
||||
std::cerr << '[' << i << ',' << j << ',' << "]:" << input[offset] << "----"
|
||||
<< output[offset] << " ";
|
||||
failed("mistmatch at:%d %d", i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM)
|
||||
{
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for(size_t i = offset; i < NELEM; i += stride)
|
||||
{
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
vectorADDReverse(const T* A_d, const T* B_d, T* C_d, size_t NELEM)
|
||||
{
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for(int64_t i = NELEM - stride + offset; i >= 0; i -= stride)
|
||||
{
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
addCount(const T* A_d, T* C_d, size_t NELEM, int count)
|
||||
{
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
// Deliberately do this in an inefficient way to increase kernel runtime
|
||||
for(int i = 0; i < count; i++)
|
||||
{
|
||||
for(size_t i = offset; i < NELEM; i += stride)
|
||||
{
|
||||
C_d[i] = A_d[i] + (T) count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
addCountReverse(const T* A_d, T* C_d, int64_t NELEM, int count)
|
||||
{
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
// Deliberately do this in an inefficient way to increase kernel runtime
|
||||
for(int i = 0; i < count; i++)
|
||||
{
|
||||
for(int64_t i = NELEM - stride + offset; i >= 0; i -= stride)
|
||||
{
|
||||
C_d[i] = A_d[i] + (T) count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
memsetReverse(T* C_d, T val, int64_t NELEM)
|
||||
{
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for(int64_t i = NELEM - stride + offset; i >= 0; i -= stride)
|
||||
{
|
||||
C_d[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
setDefaultData(size_t numElements, T* A_h, T* B_h, T* C_h)
|
||||
{
|
||||
// Initialize the host data:
|
||||
for(size_t i = 0; i < numElements; i++)
|
||||
{
|
||||
if(A_h) (A_h)[i] = 3.146f + i; // Pi
|
||||
if(B_h) (B_h)[i] = 1.618f + i; // Phi
|
||||
if(C_h) (C_h)[i] = 0.0f + i;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
initArraysForHost(T** A_h, T** B_h, T** C_h, size_t N, bool usePinnedHost = false)
|
||||
{
|
||||
size_t Nbytes = N * sizeof(T);
|
||||
|
||||
if(usePinnedHost)
|
||||
{
|
||||
if(A_h)
|
||||
{
|
||||
HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(A_h), Nbytes));
|
||||
}
|
||||
if(B_h)
|
||||
{
|
||||
HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(B_h), Nbytes));
|
||||
}
|
||||
if(C_h)
|
||||
{
|
||||
HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(C_h), Nbytes));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(A_h)
|
||||
{
|
||||
*A_h = (T*) malloc(Nbytes);
|
||||
HIPASSERT(*A_h != NULL);
|
||||
}
|
||||
|
||||
if(B_h)
|
||||
{
|
||||
*B_h = (T*) malloc(Nbytes);
|
||||
HIPASSERT(*B_h != NULL);
|
||||
}
|
||||
|
||||
if(C_h)
|
||||
{
|
||||
*C_h = (T*) malloc(Nbytes);
|
||||
HIPASSERT(*C_h != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
setDefaultData(N, A_h ? *A_h : NULL, B_h ? *B_h : NULL, C_h ? *C_h : NULL);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
initArrays(T** A_d,
|
||||
T** B_d,
|
||||
T** C_d,
|
||||
T** A_h,
|
||||
T** B_h,
|
||||
T** C_h,
|
||||
size_t N,
|
||||
bool usePinnedHost = false)
|
||||
{
|
||||
size_t Nbytes = N * sizeof(T);
|
||||
|
||||
if(A_d)
|
||||
{
|
||||
HIPCHECK(hipMalloc(A_d, Nbytes));
|
||||
}
|
||||
if(B_d)
|
||||
{
|
||||
HIPCHECK(hipMalloc(B_d, Nbytes));
|
||||
}
|
||||
if(C_d)
|
||||
{
|
||||
HIPCHECK(hipMalloc(C_d, Nbytes));
|
||||
}
|
||||
|
||||
initArraysForHost(A_h, B_h, C_h, N, usePinnedHost);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
freeArraysForHost(T* A_h, T* B_h, T* C_h, bool usePinnedHost)
|
||||
{
|
||||
if(usePinnedHost)
|
||||
{
|
||||
if(A_h)
|
||||
{
|
||||
HIPCHECK(hipHostFree(A_h));
|
||||
}
|
||||
if(B_h)
|
||||
{
|
||||
HIPCHECK(hipHostFree(B_h));
|
||||
}
|
||||
if(C_h)
|
||||
{
|
||||
HIPCHECK(hipHostFree(C_h));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(A_h)
|
||||
{
|
||||
free(A_h);
|
||||
}
|
||||
if(B_h)
|
||||
{
|
||||
free(B_h);
|
||||
}
|
||||
if(C_h)
|
||||
{
|
||||
free(C_h);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
freeArrays(T* A_d, T* B_d, T* C_d, T* A_h, T* B_h, T* C_h, bool usePinnedHost)
|
||||
{
|
||||
if(A_d)
|
||||
{
|
||||
HIPCHECK(hipFree(A_d));
|
||||
}
|
||||
if(B_d)
|
||||
{
|
||||
HIPCHECK(hipFree(B_d));
|
||||
}
|
||||
if(C_d)
|
||||
{
|
||||
HIPCHECK(hipFree(C_d));
|
||||
}
|
||||
|
||||
freeArraysForHost(A_h, B_h, C_h, usePinnedHost);
|
||||
}
|
||||
|
||||
# if defined(__HIP_PLATFORM_AMD__)
|
||||
template <typename T>
|
||||
void
|
||||
initArrays2DPitch(T** A_d,
|
||||
T** B_d,
|
||||
T** C_d,
|
||||
size_t* pitch_A,
|
||||
size_t* pitch_B,
|
||||
size_t* pitch_C,
|
||||
size_t numW,
|
||||
size_t numH)
|
||||
{
|
||||
if(A_d)
|
||||
{
|
||||
HIPCHECK(hipMallocPitch((void**) A_d, pitch_A, numW * sizeof(T), numH));
|
||||
}
|
||||
if(B_d)
|
||||
{
|
||||
HIPCHECK(hipMallocPitch((void**) B_d, pitch_B, numW * sizeof(T), numH));
|
||||
}
|
||||
if(C_d)
|
||||
{
|
||||
HIPCHECK(hipMallocPitch((void**) C_d, pitch_C, numW * sizeof(T), numH));
|
||||
}
|
||||
|
||||
HIPASSERT(*pitch_A == *pitch_B);
|
||||
HIPASSERT(*pitch_A == *pitch_C)
|
||||
}
|
||||
|
||||
inline void
|
||||
initHIPArrays(hipArray** A_d,
|
||||
hipArray** B_d,
|
||||
hipArray** C_d,
|
||||
const hipChannelFormatDesc* desc,
|
||||
const size_t numW,
|
||||
const size_t numH,
|
||||
const unsigned int flags)
|
||||
{
|
||||
if(A_d)
|
||||
{
|
||||
HIPCHECK(hipMallocArray(A_d, desc, numW, numH, flags));
|
||||
}
|
||||
if(B_d)
|
||||
{
|
||||
HIPCHECK(hipMallocArray(B_d, desc, numW, numH, flags));
|
||||
}
|
||||
if(C_d)
|
||||
{
|
||||
HIPCHECK(hipMallocArray(C_d, desc, numW, numH, flags));
|
||||
}
|
||||
}
|
||||
# endif
|
||||
|
||||
// Assumes C_h contains vector add of A_h + B_h
|
||||
// Calls the test "failed" macro if a mismatch is detected.
|
||||
template <typename T>
|
||||
size_t
|
||||
checkVectorADD(T* A_h,
|
||||
T* B_h,
|
||||
T* result_H,
|
||||
size_t N,
|
||||
bool expectMatch = true,
|
||||
bool reportMismatch = true)
|
||||
{
|
||||
size_t mismatchCount = 0;
|
||||
size_t firstMismatch = 0;
|
||||
size_t mismatchesToPrint = 10;
|
||||
for(size_t i = 0; i < N; i++)
|
||||
{
|
||||
T expected = A_h[i] + B_h[i];
|
||||
if(result_H[i] != expected)
|
||||
{
|
||||
if(mismatchCount == 0)
|
||||
{
|
||||
firstMismatch = i;
|
||||
}
|
||||
mismatchCount++;
|
||||
if((mismatchCount <= mismatchesToPrint) && expectMatch)
|
||||
{
|
||||
std::cout << std::fixed << std::setprecision(32);
|
||||
std::cout << "At " << i << std::endl;
|
||||
std::cout << " Computed:" << result_H[i] << std::endl;
|
||||
std::cout << " Expected:" << expected << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(reportMismatch)
|
||||
{
|
||||
if(expectMatch)
|
||||
{
|
||||
if(mismatchCount)
|
||||
{
|
||||
failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(mismatchCount == 0)
|
||||
{
|
||||
failed("expected mismatches but did not detect any!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mismatchCount;
|
||||
}
|
||||
|
||||
// Assumes C_h contains vector add of A_h + B_h
|
||||
// Calls the test "failed" macro if a mismatch is detected.
|
||||
template <typename T>
|
||||
void
|
||||
checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch = true)
|
||||
{
|
||||
size_t mismatchCount = 0;
|
||||
size_t firstMismatch = 0;
|
||||
size_t mismatchesToPrint = 10;
|
||||
for(size_t i = 0; i < N; i++)
|
||||
{
|
||||
if(result_H[i] != expected_H[i])
|
||||
{
|
||||
if(mismatchCount == 0)
|
||||
{
|
||||
firstMismatch = i;
|
||||
}
|
||||
mismatchCount++;
|
||||
if((mismatchCount <= mismatchesToPrint) && expectMatch)
|
||||
{
|
||||
std::cout << std::fixed << std::setprecision(32);
|
||||
std::cout << "At " << i << std::endl;
|
||||
std::cout << " Computed:" << result_H[i] << std::endl;
|
||||
std::cout << " Expected:" << expected_H[i] << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(expectMatch)
|
||||
{
|
||||
if(mismatchCount)
|
||||
{
|
||||
fprintf(stderr, "%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
|
||||
// failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(mismatchCount == 0)
|
||||
{
|
||||
failed("expected mismatches but did not detect any!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---
|
||||
struct Pinned
|
||||
{
|
||||
static const bool isPinned = true;
|
||||
static const char* str() { return "Pinned"; };
|
||||
|
||||
static void* Alloc(size_t sizeBytes)
|
||||
{
|
||||
void* p;
|
||||
HIPCHECK(hipHostMalloc((void**) &p, sizeBytes));
|
||||
return p;
|
||||
};
|
||||
};
|
||||
|
||||
//---
|
||||
struct Unpinned
|
||||
{
|
||||
static const bool isPinned = false;
|
||||
static const char* str() { return "Unpinned"; };
|
||||
|
||||
static void* Alloc(size_t sizeBytes)
|
||||
{
|
||||
void* p = malloc(sizeBytes);
|
||||
HIPASSERT(p);
|
||||
return p;
|
||||
};
|
||||
};
|
||||
|
||||
struct Memcpy
|
||||
{
|
||||
static const char* str() { return "Memcpy"; };
|
||||
};
|
||||
|
||||
struct MemcpyAsync
|
||||
{
|
||||
static const char* str() { return "MemcpyAsync"; };
|
||||
};
|
||||
|
||||
template <typename C>
|
||||
struct MemTraits;
|
||||
|
||||
template <>
|
||||
struct MemTraits<Memcpy>
|
||||
{
|
||||
static void Copy(void* dest, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t)
|
||||
{
|
||||
HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct MemTraits<MemcpyAsync>
|
||||
{
|
||||
static void Copy(void* dest,
|
||||
const void* src,
|
||||
size_t sizeBytes,
|
||||
hipMemcpyKind kind,
|
||||
hipStream_t stream)
|
||||
{
|
||||
HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream));
|
||||
}
|
||||
};
|
||||
|
||||
inline bool
|
||||
isImageSupported()
|
||||
{
|
||||
int imageSupport = 1;
|
||||
# ifdef __HIP_PLATFORM_AMD__
|
||||
HIPCHECK(hipDeviceGetAttribute(&imageSupport, hipDeviceAttributeImageSupport, p_gpuDevice));
|
||||
# endif
|
||||
return imageSupport != 0;
|
||||
}
|
||||
|
||||
}; // namespace HipTest
|
||||
|
||||
// This must be called in the beginning of image test app's main() to indicate whether image
|
||||
// is supported.
|
||||
# define checkImageSupport() \
|
||||
if(!HipTest::isImageSupported()) \
|
||||
{ \
|
||||
printf("Texture is not support on the device. Skipped.\n"); \
|
||||
passed(); \
|
||||
}
|
||||
#endif //__cplusplus
|
||||
|
||||
// Function to determine if the device is of gfx11 architecture
|
||||
bool
|
||||
IsGfx11();
|
||||
Spustitelný soubor
+1487
Rozdílový obsah nebyl zobrazen, protože je příliš veliký
Načíst rozdílové porovnání
@@ -0,0 +1,11 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
# install the downloaded timem libraries
|
||||
install(
|
||||
DIRECTORY ${PROJECT_BINARY_DIR}/lib64/
|
||||
DESTINATION ${CMAKE_INSTALL_LIB64DIR}
|
||||
FILES_MATCHING
|
||||
PATTERN "*libtimem.*"
|
||||
PATTERN "*/timemory/*")
|
||||
@@ -0,0 +1,5 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
add_subdirectory(rocprofiler-sdk)
|
||||
@@ -0,0 +1,12 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
set(DATA_FILES benchmark_tables.sql benchmark_views.sql)
|
||||
|
||||
foreach(_FILE ${DATA_FILES})
|
||||
configure_file(${_FILE} ${CMAKE_DATAROOT_OUTPUT_DIRECTORY}/rocprofiler-sdk/${_FILE}
|
||||
COPYONLY)
|
||||
endforeach()
|
||||
|
||||
install(FILES ${DATA_FILES} DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-sdk)
|
||||
+137
@@ -0,0 +1,137 @@
|
||||
-- Application used for benchmarking
|
||||
-- Columns such "hip_compiler_api", ..., "scratch_memory" are
|
||||
-- the number of events in the given category, e.g. kernel_dispatch
|
||||
-- represents the number of kernel dispatches in the app. These
|
||||
-- can be approximate since for a given application, the exact
|
||||
-- count may vary.
|
||||
CREATE TABLE IF NOT EXISTS
|
||||
`benchmarked_app` (
|
||||
id INT PRIMARY KEY AUTO_INCREMENT UNIQUE,
|
||||
hash_id TEXT NOT NULL,
|
||||
md5sum TEXT NOT NULL,
|
||||
revision INT DEFAULT 0,
|
||||
command JSON NOT NULL,
|
||||
compiler_id TEXT,
|
||||
compiler_version TEXT,
|
||||
library_arch TEXT,
|
||||
system_name TEXT,
|
||||
system_processor TEXT,
|
||||
system_version TEXT,
|
||||
threads INT,
|
||||
hip_compiler_api INT,
|
||||
hip_runtime_api INT,
|
||||
hsa_api INT,
|
||||
kernel_dispatch INT,
|
||||
marker_api INT,
|
||||
memory_allocation INT,
|
||||
memory_copy INT,
|
||||
ompt INT,
|
||||
rccl_api INT,
|
||||
rocdecode_api INT,
|
||||
rocjpeg_api INT,
|
||||
scratch_memory INT,
|
||||
environment JSON DEFAULT ("{}")
|
||||
);
|
||||
|
||||
-- rocprofiler-sdk used for benchmarking
|
||||
CREATE TABLE IF NOT EXISTS
|
||||
`benchmarked_sdk` (
|
||||
id INT PRIMARY KEY AUTO_INCREMENT UNIQUE,
|
||||
hash_id TEXT NOT NULL,
|
||||
version_major INT NOT NULL,
|
||||
version_minor INT NOT NULL,
|
||||
version_patch INT NOT NULL,
|
||||
soversion INT NOT NULL,
|
||||
compiler_id TEXT NOT NULL,
|
||||
compiler_version TEXT NOT NULL,
|
||||
git_revision TEXT NOT NULL,
|
||||
library_arch TEXT NOT NULL,
|
||||
system_name TEXT NOT NULL,
|
||||
system_processor TEXT NOT NULL,
|
||||
system_version TEXT NOT NULL
|
||||
);
|
||||
|
||||
-- rocprofiler-sdk used for benchmarking
|
||||
CREATE TABLE IF NOT EXISTS
|
||||
`benchmark_config` (
|
||||
id INT PRIMARY KEY AUTO_INCREMENT UNIQUE,
|
||||
hash_id TEXT NOT NULL,
|
||||
sdk_id INT,
|
||||
label TEXT, -- name identifier
|
||||
benchmark_mode TEXT CHECK (
|
||||
benchmark_mode IN (
|
||||
"baseline",
|
||||
"disabled-sdk-contexts",
|
||||
"sdk-buffer-overhead",
|
||||
"sdk-callback-overhead",
|
||||
"tool-runtime-overhead"
|
||||
)
|
||||
) NOT NULL,
|
||||
kernel_rename INT,
|
||||
group_by_queue INT,
|
||||
kernel_trace INT,
|
||||
hsa_trace INT,
|
||||
hip_runtime_trace INT,
|
||||
hip_compiler_trace INT,
|
||||
marker_trace INT,
|
||||
memory_copy_trace INT,
|
||||
memory_allocation_trace INT,
|
||||
scratch_memory_trace INT,
|
||||
dispatch_counter_collection INT,
|
||||
rccl_trace INT,
|
||||
rocdecode_trace INT,
|
||||
rocjpeg_trace INT,
|
||||
pmc_counters JSON DEFAULT ("[]"),
|
||||
pc_sampling_host_trap INT,
|
||||
pc_sampling_stocastic INT,
|
||||
advanced_thread_trace INT,
|
||||
--
|
||||
-- Eventually, we will create tables for storing the subconfigurations for pc sampling and ATT
|
||||
--
|
||||
-- pc_sampling_host_trap_config_id INT,
|
||||
-- pc_sampling_stocastic_config_id INT,
|
||||
-- advanced_thread_trace_config_id INT,
|
||||
-- FOREIGN KEY (pc_sampling_host_trap_config_id) REFERENCES benchmark_pc_sampling_host_trap_config (id) ON UPDATE CASCADE,
|
||||
-- FOREIGN KEY (pc_sampling_stocastic_config_id) REFERENCES benchmark_pc_sampling_stocastic_config (id) ON UPDATE CASCADE,
|
||||
-- FOREIGN KEY (advanced_thread_trace_config_id) REFERENCES benchmark_advanced_thread_trace_config (id) ON UPDATE CASCADE
|
||||
FOREIGN KEY (sdk_id) REFERENCES benchmarked_sdk (id) ON UPDATE CASCADE
|
||||
);
|
||||
|
||||
-- metrics for the benchmark
|
||||
CREATE TABLE IF NOT EXISTS
|
||||
`benchmark_metrics` (
|
||||
id INT PRIMARY KEY AUTO_INCREMENT UNIQUE,
|
||||
app_id INT NOT NULL,
|
||||
cfg_id INT NOT NULL,
|
||||
sdk_id INT,
|
||||
executed_at TIMESTAMP NOT NULL,
|
||||
wall_time DOUBLE NOT NULL,
|
||||
cpu_time DOUBLE NOT NULL,
|
||||
cpu_util DOUBLE NOT NULL,
|
||||
peak_rss DOUBLE NOT NULL,
|
||||
page_rss DOUBLE NOT NULL,
|
||||
virtual_memory DOUBLE NOT NULL,
|
||||
major_page_faults BIGINT NOT NULL,
|
||||
minor_page_faults BIGINT NOT NULL,
|
||||
priority_context_switches BIGINT NOT NULL,
|
||||
voluntary_context_switches BIGINT NOT NULL,
|
||||
FOREIGN KEY (app_id) REFERENCES benchmarked_app (id) ON UPDATE CASCADE,
|
||||
FOREIGN KEY (cfg_id) REFERENCES benchmark_config (id) ON UPDATE CASCADE,
|
||||
FOREIGN KEY (sdk_id) REFERENCES benchmarked_sdk (id) ON UPDATE CASCADE
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS
|
||||
`benchmark_statistics` (
|
||||
id INT PRIMARY KEY AUTO_INCREMENT UNIQUE,
|
||||
app_id INT NOT NULL,
|
||||
cfg_id INT NOT NULL,
|
||||
sdk_id INT,
|
||||
metric_name TEXT NOT NULL,
|
||||
metric_unit TEXT NOT NULL,
|
||||
count INT NOT NULL,
|
||||
sum DOUBLE NOT NULL,
|
||||
mean DOUBLE NOT NULL,
|
||||
min DOUBLE NOT NULL,
|
||||
max DOUBLE NOT NULL,
|
||||
std_dev DOUBLE
|
||||
);
|
||||
+75
@@ -0,0 +1,75 @@
|
||||
-- Analysis views used for benchmarking
|
||||
CREATE VIEW IF NOT EXISTS
|
||||
`benchmark_analysis_{{metric}}` AS
|
||||
WITH
|
||||
baseline AS (
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
benchmark_statistics BL
|
||||
WHERE
|
||||
BL.sdk_id IS NULL
|
||||
AND BL.metric_name = "{{metric}}"
|
||||
)
|
||||
SELECT
|
||||
ST.id,
|
||||
ST.app_id,
|
||||
ST.cfg_id,
|
||||
ST.sdk_id,
|
||||
BS.git_revision,
|
||||
BA.command,
|
||||
ST.metric_name,
|
||||
ST.metric_unit,
|
||||
ST.count,
|
||||
ST.mean,
|
||||
ST.std_dev AS `+/-`,
|
||||
BL.mean AS baseline_mean,
|
||||
BL.std_dev AS `+/- (baseline)`,
|
||||
((ST.mean - BL.mean) / BL.mean) * 100 AS `overhead (%)`,
|
||||
BC.benchmark_mode,
|
||||
BC.label AS benchmark_label
|
||||
FROM
|
||||
benchmark_statistics ST
|
||||
JOIN benchmark_config BC ON BC.id = ST.cfg_id
|
||||
JOIN benchmarked_sdk BS ON BS.id = ST.sdk_id
|
||||
JOIN benchmarked_app BA ON BA.id = ST.app_id
|
||||
JOIN baseline BL ON (
|
||||
BL.app_id = ST.app_id
|
||||
AND BL.metric_name = ST.metric_name
|
||||
)
|
||||
WHERE
|
||||
ST.metric_name = "{{metric}}"
|
||||
AND ST.sdk_id IS NOT NULL
|
||||
ORDER BY
|
||||
`overhead (%)` DESC;
|
||||
|
||||
-- benchmarked_app without environment info
|
||||
CREATE VIEW IF NOT EXISTS
|
||||
`benchmarked_app_without_env` AS
|
||||
SELECT
|
||||
id,
|
||||
hash_id,
|
||||
md5sum,
|
||||
revision,
|
||||
command,
|
||||
compiler_id,
|
||||
compiler_version,
|
||||
library_arch,
|
||||
system_name,
|
||||
system_processor,
|
||||
system_version,
|
||||
threads,
|
||||
hip_compiler_api,
|
||||
hip_runtime_api,
|
||||
hsa_api,
|
||||
kernel_dispatch,
|
||||
marker_api,
|
||||
memory_allocation,
|
||||
memory_copy,
|
||||
ompt,
|
||||
rccl_api,
|
||||
rocdecode_api,
|
||||
rocjpeg_api,
|
||||
scratch_memory
|
||||
FROM
|
||||
benchmarked_app;
|
||||
@@ -60,8 +60,8 @@ list(LENGTH ROCPROFILER_PACKAGING_COMPONENTS NUM_ROCPROFILER_PACKAGING_COMPONENT
|
||||
# the packages we will generate
|
||||
set(ROCPROFILER_COMPONENT_GROUPS "core" "docs" "tests" "roctx")
|
||||
|
||||
set(COMPONENT_GROUP_core_COMPONENTS "core" "development" "samples" "tools" "Development"
|
||||
"Unspecified")
|
||||
set(COMPONENT_GROUP_core_COMPONENTS "core" "development" "samples" "tools" "benchmark"
|
||||
"Development" "Unspecified")
|
||||
set(COMPONENT_GROUP_docs_COMPONENTS "docs")
|
||||
set(COMPONENT_GROUP_tests_COMPONENTS "tests")
|
||||
set(COMPONENT_GROUP_roctx_COMPONENTS "roctx")
|
||||
@@ -87,7 +87,10 @@ set(COMPONENT_DESC_roctx "ROCm Tools Extension library and headers")
|
||||
|
||||
set(EXPECTED_PACKAGING_COMPONENTS 6)
|
||||
if(ROCPROFILER_BUILD_DOCS)
|
||||
set(EXPECTED_PACKAGING_COMPONENTS 7)
|
||||
math(EXPR EXPECTED_PACKAGING_COMPONENTS "${EXPECTED_PACKAGING_COMPONENTS} + 1")
|
||||
endif()
|
||||
if(ROCPROFILER_BUILD_BENCHMARK)
|
||||
math(EXPR EXPECTED_PACKAGING_COMPONENTS "${EXPECTED_PACKAGING_COMPONENTS} + 1")
|
||||
endif()
|
||||
|
||||
if(NOT NUM_ROCPROFILER_PACKAGING_COMPONENTS EQUAL EXPECTED_PACKAGING_COMPONENTS)
|
||||
|
||||
@@ -74,7 +74,7 @@ if(ROCPROFILER_CLANG_FORMAT_EXE
|
||||
set(rocp_cmake_files ${PROJECT_SOURCE_DIR}/CMakeLists.txt
|
||||
${PROJECT_SOURCE_DIR}/external/CMakeLists.txt)
|
||||
|
||||
foreach(_DIR cmake samples source tests)
|
||||
foreach(_DIR cmake samples source tests benchmark)
|
||||
foreach(_TYPE header_files source_files cmake_files python_files)
|
||||
set(${_TYPE})
|
||||
endforeach()
|
||||
|
||||
@@ -38,6 +38,7 @@ rocprofiler_add_option(ROCPROFILER_BUILD_TESTS "Enable building the tests"
|
||||
${ROCPROFILER_BUILD_CI})
|
||||
rocprofiler_add_option(ROCPROFILER_BUILD_SAMPLES "Enable building the code samples"
|
||||
${ROCPROFILER_BUILD_CI})
|
||||
rocprofiler_add_option(ROCPROFILER_BUILD_BENCHMARK "Enable building the benchmarks" OFF)
|
||||
rocprofiler_add_option(
|
||||
ROCPROFILER_BUILD_CI_STRICT_TIMESTAMPS
|
||||
"Disable adjusting for clock skew b/t CPU and GPU timestamps" OFF ADVANCED)
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
ARG BASE_IMAGE=rocm/rocm-terminal
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG BRANCH=amd-staging
|
||||
|
||||
RUN git clone -b ${BRANCH} https://github.com/ROCm/rocprofiler-sdk.git rocprofiler-sdk-source && \
|
||||
python3 -m pip install -r rocprofiler-sdk-source/requirements.txt && \
|
||||
sudo apt update && \
|
||||
sudo apt install -y libdw-dev && \
|
||||
cmake -B rocprofiler-sdk-build -DCMAKE_BUILD_TYPE=RelWithDebInfo -DROCPROFILER_BUILD_{SAMPLES,TESTS,BENCHMARK}=ON -DPython3_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$(realpath /opt/rocm) rocprofiler-sdk-source && \
|
||||
cmake --build rocprofiler-sdk-build --target all --parallel 16 && \
|
||||
sudo cmake --build rocprofiler-sdk-build --target install && \
|
||||
sudo rm -rf rocprofiler-sdk-source rocprofiler-sdk-build
|
||||
@@ -248,6 +248,11 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
choices=("csv", "json", "pftrace", "otf2"),
|
||||
type=str.lower,
|
||||
)
|
||||
add_parser_bool_argument(
|
||||
io_options,
|
||||
"--output-config",
|
||||
help="Generate a output file of the rocprofv3 configuration, e.g. out_config.json",
|
||||
)
|
||||
io_options.add_argument(
|
||||
"--log-level",
|
||||
help="Set the desired log level",
|
||||
@@ -518,6 +523,11 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
type=str,
|
||||
choices=("hour", "min", "sec", "msec", "usec", "nsec"),
|
||||
)
|
||||
add_parser_bool_argument(
|
||||
filter_options,
|
||||
"--selected-regions",
|
||||
help="If set, rocprofv3 will only profile regions of code surrounded by roctxProfilerResume(0) and roctxProfilerPause(0)",
|
||||
)
|
||||
|
||||
perfetto_options = parser.add_argument_group("Perfetto-specific options")
|
||||
|
||||
@@ -592,6 +602,19 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
"--realpath",
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
advanced_options.add_argument(
|
||||
"--benchmark-mode",
|
||||
choices=(
|
||||
"disabled-sdk-contexts",
|
||||
"sdk-buffer-overhead",
|
||||
"sdk-callback-overhead",
|
||||
"tool-runtime-overhead",
|
||||
"execution-profile",
|
||||
),
|
||||
help=argparse.SUPPRESS,
|
||||
default=None,
|
||||
type=str.lower,
|
||||
)
|
||||
advanced_options.add_argument(
|
||||
"-A",
|
||||
"--agent-index",
|
||||
@@ -1034,6 +1057,7 @@ def run(app_args, args, **kwargs):
|
||||
|
||||
update_env("ROCPROF_OUTPUT_FILE_NAME", _output_file)
|
||||
update_env("ROCPROF_OUTPUT_PATH", _output_path)
|
||||
update_env("ROCPROF_OUTPUT_CONFIG_FILE", args.output_config, overwrite_if_true=True)
|
||||
if app_pass is not None and args.sub_directory is not None:
|
||||
app_env["ROCPROF_OUTPUT_PATH"] = os.path.join(
|
||||
f"{_output_path}", f"{args.sub_directory}{app_pass}"
|
||||
@@ -1198,6 +1222,11 @@ def run(app_args, args, **kwargs):
|
||||
args.truncate_kernels,
|
||||
overwrite_if_true=True,
|
||||
)
|
||||
update_env(
|
||||
"ROCPROF_SELECTED_REGIONS",
|
||||
args.selected_regions,
|
||||
overwrite_if_true=True,
|
||||
)
|
||||
|
||||
if args.list_avail:
|
||||
update_env(
|
||||
@@ -1241,6 +1270,24 @@ def run(app_args, args, **kwargs):
|
||||
args.log_level,
|
||||
)
|
||||
|
||||
if args.benchmark_mode:
|
||||
if args.benchmark_mode == "execution-profile":
|
||||
if args.group_by_queue is None:
|
||||
update_env(
|
||||
"ROCPROF_GROUP_BY_QUEUE",
|
||||
False,
|
||||
overwrite=True,
|
||||
)
|
||||
elif args.group_by_queue:
|
||||
fatal_error(
|
||||
"rocprofv3 requires --group-by-queue=false for --benchmark-mode=execution-profile"
|
||||
)
|
||||
|
||||
update_env(
|
||||
"ROCPROF_BENCHMARK_MODE",
|
||||
args.benchmark_mode,
|
||||
)
|
||||
|
||||
for opt, env_val in dict(
|
||||
[
|
||||
["kernel_rename", "KERNEL_RENAME"],
|
||||
|
||||
@@ -46,4 +46,4 @@ To run the built samples, ``cd`` into the ``build-rocprofiler-sdk-samples`` dire
|
||||
|
||||
ctest -V
|
||||
|
||||
The `-V` option enables verbose output, providing detailed information about the test execution.
|
||||
The `-V` option enables verbose output, providing detailed information about the test execution.
|
||||
|
||||
@@ -126,4 +126,4 @@ If you have ROCm version 6.2 or higher installed, you can use the package manage
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ sudo zypper install rocprofiler-sdk
|
||||
$ sudo zypper install rocprofiler-sdk
|
||||
|
||||
@@ -55,7 +55,7 @@
|
||||
}
|
||||
|
||||
#define ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(TYPE) \
|
||||
inline bool operator>(TYPE lhs, TYPE rhs) { return (lhs == rhs || !(lhs < rhs)); } \
|
||||
inline bool operator>(TYPE lhs, TYPE rhs) { return !(lhs == rhs || lhs < rhs); } \
|
||||
inline bool operator<=(TYPE lhs, TYPE rhs) { return (lhs == rhs || lhs < rhs); } \
|
||||
inline bool operator>=(TYPE lhs, TYPE rhs) { return !(lhs < rhs); }
|
||||
|
||||
|
||||
@@ -135,6 +135,29 @@ rocprofiler_configure_external_correlation_id_request_service(
|
||||
rocprofiler_external_correlation_id_request_cb_t callback,
|
||||
void* callback_args) ROCPROFILER_API ROCPROFILER_NONNULL(4);
|
||||
|
||||
/**
|
||||
* @brief Query the name of the external correlation request kind. The name retrieved from this
|
||||
* function is a string literal that is encoded in the read-only section of the binary (i.e. it is
|
||||
* always "allocated" and never "deallocated").
|
||||
*
|
||||
* @param [in] kind External correlation id request domain
|
||||
* @param [out] name If non-null and the name is a constant string that does not require dynamic
|
||||
* allocation, this paramter will be set to the address of the string literal, otherwise it will
|
||||
* be set to nullptr
|
||||
* @param [out] name_len If non-null, this will be assigned the length of the name (regardless of
|
||||
* the name is a constant string or requires dynamic allocation)
|
||||
* @return ::rocprofiler_status_t
|
||||
* @retval ::ROCPROFILER_STATUS_ERROR_KIND_NOT_FOUND Returned if the domain id is not valid
|
||||
* @retval ::ROCPROFILER_STATUS_SUCCESS Returned if a valid domain, regardless if there is a
|
||||
* constant string or not.
|
||||
*/
|
||||
ROCPROFILER_SDK_EXPERIMENTAL
|
||||
rocprofiler_status_t
|
||||
rocprofiler_query_external_correlation_id_request_kind_name(
|
||||
rocprofiler_external_correlation_id_request_kind_t kind,
|
||||
const char** name,
|
||||
uint64_t* name_len) ROCPROFILER_API;
|
||||
|
||||
/**
|
||||
* @brief Push default value for `external` field in ::rocprofiler_correlation_id_t onto stack.
|
||||
*
|
||||
|
||||
@@ -94,7 +94,7 @@
|
||||
#define ROCPROFILER_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
|
||||
#define ROCPROFILER_VERSION_MINOR @PROJECT_VERSION_MINOR@
|
||||
#define ROCPROFILER_VERSION_PATCH @PROJECT_VERSION_PATCH@
|
||||
#define ROCPROFILER_SOVERSION (10000 * @PROJECT_VERSION_MAJOR@)
|
||||
#define ROCPROFILER_SOVERSION @PROJECT_VERSION_MAJOR@
|
||||
#define ROCPROFILER_VERSION_STRING "@FULL_VERSION_STRING@"
|
||||
#define ROCPROFILER_GIT_DESCRIBE "@ROCPROFILER_SDK_GIT_DESCRIBE@"
|
||||
#define ROCPROFILER_GIT_REVISION "@ROCPROFILER_SDK_GIT_REVISION@"
|
||||
|
||||
@@ -130,6 +130,7 @@ output_config::save(ArchiveT& ar) const
|
||||
CFG_SERIALIZE_MEMBER(otf2_output);
|
||||
CFG_SERIALIZE_MEMBER(summary_output);
|
||||
CFG_SERIALIZE_MEMBER(kernel_rename);
|
||||
CFG_SERIALIZE_MEMBER(group_by_queue);
|
||||
|
||||
#undef CFG_SERIALIZE_MEMBER
|
||||
#undef CFG_SERIALIZE_NAMED_MEMBER
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
rocprofiler_activate_clang_tidy()
|
||||
|
||||
set(TOOL_HEADERS config.hpp helper.hpp stream_stack.hpp)
|
||||
set(TOOL_HEADERS config.hpp execution_profile.hpp helper.hpp stream_stack.hpp)
|
||||
|
||||
set(TOOL_SOURCES config.cpp main.c tool.cpp stream_stack.cpp)
|
||||
|
||||
|
||||
@@ -319,6 +319,23 @@ config::config()
|
||||
std::stoull(_config_params.at(2))});
|
||||
}
|
||||
}
|
||||
|
||||
// Benchmarking Enable/Disable
|
||||
if(!benchmark_mode_env.empty())
|
||||
{
|
||||
const auto valid_options = std::unordered_map<std::string_view, config::benchmark>{
|
||||
{"disabled-sdk-contexts", benchmark::disabled_contexts_overhead},
|
||||
{"sdk-buffer-overhead", benchmark::sdk_buffered_overhead},
|
||||
{"sdk-callback-overhead", benchmark::sdk_callback_overhead},
|
||||
{"tool-runtime-overhead", benchmark::tool_runtime_overhead},
|
||||
{"execution-profile", benchmark::execution_profile},
|
||||
};
|
||||
|
||||
ROCP_FATAL_IF(valid_options.count(benchmark_mode_env) == 0)
|
||||
<< fmt::format("Invalid value for ROCPROF_BENCHMARK_MODE: {}", benchmark_mode_env);
|
||||
|
||||
benchmark_mode = valid_options.at(benchmark_mode_env);
|
||||
}
|
||||
}
|
||||
|
||||
std::string
|
||||
|
||||
@@ -87,6 +87,16 @@ struct config : output_config
|
||||
void save(ArchiveT& ar) const;
|
||||
};
|
||||
|
||||
enum class benchmark
|
||||
{
|
||||
none = 0,
|
||||
disabled_contexts_overhead,
|
||||
sdk_callback_overhead,
|
||||
sdk_buffered_overhead,
|
||||
tool_runtime_overhead,
|
||||
execution_profile,
|
||||
};
|
||||
|
||||
config();
|
||||
|
||||
~config() = default;
|
||||
@@ -114,11 +124,13 @@ struct config : output_config
|
||||
bool rocjpeg_api_trace = get_env("ROCPROF_ROCJPEG_API_TRACE", false);
|
||||
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
|
||||
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
|
||||
bool pc_sampling_host_trap = false;
|
||||
bool advanced_thread_trace = get_env("ROCPROF_ADVANCED_THREAD_TRACE", false);
|
||||
bool pc_sampling_stochastic = false;
|
||||
bool att_serialize_all = get_env("ROCPROF_ATT_PARAM_SERIALIZE_ALL", false);
|
||||
bool enable_signal_handlers = get_env("ROCPROF_SIGNAL_HANDLERS", true);
|
||||
bool selected_regions = get_env("ROCPROF_SELECTED_REGIONS", false);
|
||||
bool output_config_file = get_env("ROCPROF_OUTPUT_CONFIG_FILE", false);
|
||||
bool pc_sampling_host_trap = false;
|
||||
bool pc_sampling_stochastic = false;
|
||||
size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1);
|
||||
rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE;
|
||||
rocprofiler_pc_sampling_unit_t pc_sampling_unit_value = ROCPROFILER_PC_SAMPLING_UNIT_NONE;
|
||||
@@ -148,6 +160,9 @@ struct config : output_config
|
||||
uint64_t counter_groups_interval = get_env("ROCPROF_COUNTER_GROUPS_INTERVAL", 1);
|
||||
uint64_t minimum_output_bytes = get_env("ROCPROF_MINIMUM_OUTPUT_BYTES", 0);
|
||||
|
||||
std::string benchmark_mode_env = get_env("ROCPROF_BENCHMARK_MODE", "");
|
||||
benchmark benchmark_mode = benchmark::none;
|
||||
|
||||
template <typename ArchiveT>
|
||||
void save(ArchiveT&) const;
|
||||
|
||||
@@ -180,6 +195,8 @@ template <typename ArchiveT>
|
||||
void
|
||||
config::save(ArchiveT& ar) const
|
||||
{
|
||||
CFG_SERIALIZE_NAMED_MEMBER("benchmark_mode", benchmark_mode_env);
|
||||
|
||||
CFG_SERIALIZE_MEMBER(kernel_trace);
|
||||
CFG_SERIALIZE_MEMBER(hsa_core_api_trace);
|
||||
CFG_SERIALIZE_MEMBER(hsa_amd_ext_api_trace);
|
||||
@@ -194,6 +211,7 @@ config::save(ArchiveT& ar) const
|
||||
CFG_SERIALIZE_MEMBER(hip_compiler_api_trace);
|
||||
CFG_SERIALIZE_MEMBER(rccl_api_trace);
|
||||
CFG_SERIALIZE_MEMBER(rocdecode_api_trace);
|
||||
CFG_SERIALIZE_MEMBER(rocjpeg_api_trace);
|
||||
|
||||
CFG_SERIALIZE_MEMBER(mpi_rank);
|
||||
CFG_SERIALIZE_MEMBER(mpi_size);
|
||||
@@ -207,7 +225,13 @@ config::save(ArchiveT& ar) const
|
||||
CFG_SERIALIZE_MEMBER(truncate);
|
||||
CFG_SERIALIZE_MEMBER(minimum_output_bytes);
|
||||
CFG_SERIALIZE_MEMBER(enable_signal_handlers);
|
||||
CFG_SERIALIZE_MEMBER(selected_regions);
|
||||
|
||||
CFG_SERIALIZE_MEMBER(counter_groups_random_seed);
|
||||
CFG_SERIALIZE_MEMBER(counter_groups_interval);
|
||||
|
||||
CFG_SERIALIZE_MEMBER(pc_sampling_host_trap);
|
||||
CFG_SERIALIZE_MEMBER(pc_sampling_stochastic);
|
||||
CFG_SERIALIZE_MEMBER(pc_sampling_method);
|
||||
CFG_SERIALIZE_MEMBER(pc_sampling_unit);
|
||||
CFG_SERIALIZE_MEMBER(pc_sampling_interval);
|
||||
@@ -222,8 +246,10 @@ config::save(ArchiveT& ar) const
|
||||
CFG_SERIALIZE_MEMBER(att_param_target_cu);
|
||||
CFG_SERIALIZE_MEMBER(att_capability);
|
||||
CFG_SERIALIZE_MEMBER(att_param_perfcounters);
|
||||
CFG_SERIALIZE_MEMBER(att_param_perf_ctrl);
|
||||
|
||||
static_cast<const base_type&>(*this).save(ar);
|
||||
// serialize the base class
|
||||
static_cast<const base_type*>(this)->save(ar);
|
||||
}
|
||||
|
||||
#undef CFG_SERIALIZE_MEMBER
|
||||
|
||||
@@ -0,0 +1,104 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "lib/output/metadata.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/external_correlation.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/cxx/hash.hpp>
|
||||
#include <rocprofiler-sdk/cxx/operators.hpp>
|
||||
|
||||
#include <cereal/cereal.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace tool
|
||||
{
|
||||
struct execution_profile_data
|
||||
{
|
||||
using extern_corr_id_request_t = rocprofiler_external_correlation_id_request_kind_t;
|
||||
using operation_set_t = std::unordered_set<rocprofiler_tracing_operation_t>;
|
||||
|
||||
std::unordered_map<extern_corr_id_request_t, uint64_t> category_count = {};
|
||||
std::unordered_map<extern_corr_id_request_t, operation_set_t> category_op_count = {};
|
||||
std::unordered_set<rocprofiler_thread_id_t> threads = {};
|
||||
std::unordered_set<rocprofiler_context_id_t> contexts = {};
|
||||
};
|
||||
|
||||
struct execution_profile_category_data
|
||||
{
|
||||
uint64_t count = 0; // total invocations of a given category
|
||||
uint64_t unique = 0; // number of unique operations
|
||||
};
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
namespace cereal
|
||||
{
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, ::rocprofiler::tool::execution_profile_category_data data)
|
||||
{
|
||||
ar(cereal::make_nvp("count", data.count));
|
||||
ar(cereal::make_nvp("unique", data.unique));
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, const ::rocprofiler::tool::execution_profile_data& data)
|
||||
{
|
||||
namespace tool = ::rocprofiler::tool;
|
||||
|
||||
using category_count_map_t = std::map<std::string, tool::execution_profile_category_data>;
|
||||
|
||||
auto _category_count = category_count_map_t{};
|
||||
for(auto itr : data.category_count)
|
||||
{
|
||||
const char* _name = nullptr;
|
||||
ROCPROFILER_CHECK(rocprofiler_query_external_correlation_id_request_kind_name(
|
||||
itr.first, &_name, nullptr));
|
||||
if(_name)
|
||||
{
|
||||
auto _unique_ops = data.category_op_count.at(itr.first).size();
|
||||
auto _kind_name = std::string{_name};
|
||||
std::for_each(
|
||||
_kind_name.begin(), _kind_name.end(), [](auto& v) { v = ::std::tolower(v); });
|
||||
|
||||
_category_count.emplace(_kind_name,
|
||||
tool::execution_profile_category_data{itr.second, _unique_ops});
|
||||
}
|
||||
}
|
||||
|
||||
ar(cereal::make_nvp("threads", data.threads.size()));
|
||||
ar(cereal::make_nvp("contexts", data.contexts.size()));
|
||||
for(auto itr : _category_count)
|
||||
ar(cereal::make_nvp(itr.first.c_str(), itr.second));
|
||||
}
|
||||
} // namespace cereal
|
||||
@@ -20,10 +20,13 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "rocprofiler-sdk/defines.h"
|
||||
#include "rocprofiler-sdk/dispatch_counting_service.h"
|
||||
#define _GNU_SOURCE 1
|
||||
#define _DEFAULT_SOURCE 1
|
||||
|
||||
#include "config.hpp"
|
||||
#include "execution_profile.hpp"
|
||||
#include "helper.hpp"
|
||||
#include "stream_stack.hpp"
|
||||
|
||||
@@ -50,6 +53,7 @@
|
||||
#include "lib/output/output_stream.hpp"
|
||||
#include "lib/output/statistics.hpp"
|
||||
#include "lib/output/stream_info.hpp"
|
||||
#include "lib/output/timestamps.hpp"
|
||||
#include "lib/output/tmp_file.hpp"
|
||||
#include "lib/output/tmp_file_buffer.hpp"
|
||||
|
||||
@@ -63,11 +67,13 @@
|
||||
#include <rocprofiler-sdk/internal_threading.h>
|
||||
#include <rocprofiler-sdk/marker/api_id.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
#include <rocprofiler-sdk/version.h>
|
||||
#include <rocprofiler-sdk/cxx/hash.hpp>
|
||||
#include <rocprofiler-sdk/cxx/operators.hpp>
|
||||
|
||||
#include <fmt/core.h>
|
||||
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
@@ -146,14 +152,6 @@ add_destructor(Tp*& ptr)
|
||||
return ptr;
|
||||
}
|
||||
|
||||
#define ADD_DESTRUCTOR(PTR) \
|
||||
{ \
|
||||
static auto _once = std::once_flag{}; \
|
||||
std::call_once(_once, []() { add_destructor(PTR); }); \
|
||||
}
|
||||
|
||||
#undef ADD_DESTRUCTOR
|
||||
|
||||
struct chained_siginfo
|
||||
{
|
||||
int signo = 0;
|
||||
@@ -207,6 +205,11 @@ struct buffer_ids
|
||||
rocjpeg_api_trace,
|
||||
pc_sampling_stochastic};
|
||||
}
|
||||
auto pc_sampling_buffers_as_array() const
|
||||
{
|
||||
return std::array<rocprofiler_buffer_id_t, 2>{pc_sampling_host_trap,
|
||||
pc_sampling_stochastic};
|
||||
}
|
||||
};
|
||||
|
||||
buffer_ids&
|
||||
@@ -246,10 +249,11 @@ using kernel_iteration_t = std::unordered_map<rocprofiler_kernel_id_t, size_t
|
||||
using kernel_rename_map_t = std::unordered_map<uint64_t, uint64_t>;
|
||||
using kernel_rename_stack_t = std::stack<uint64_t>;
|
||||
|
||||
auto* tool_metadata = as_pointer<tool::metadata>(tool::metadata::inprocess{});
|
||||
auto target_kernels = common::Synchronized<targeted_kernels_map_t>{};
|
||||
std::mutex att_shader_data;
|
||||
auto* tool_metadata = as_pointer<tool::metadata>(tool::metadata::inprocess{});
|
||||
auto target_kernels = common::Synchronized<targeted_kernels_map_t>{};
|
||||
auto* execution_profile = as_pointer<common::Synchronized<tool::execution_profile_data>>();
|
||||
auto counter_collection_ctx = rocprofiler_context_id_t{0};
|
||||
std::mutex att_shader_data;
|
||||
|
||||
thread_local auto thread_dispatch_rename = as_pointer<kernel_rename_stack_t>();
|
||||
thread_local auto thread_dispatch_rename_dtor = common::scope_destructor{[]() {
|
||||
@@ -336,10 +340,12 @@ get_client_ctx()
|
||||
void
|
||||
flush()
|
||||
{
|
||||
constexpr auto null_buffer_id = rocprofiler_buffer_id_t{.handle = 0};
|
||||
|
||||
ROCP_INFO << "flushing buffers...";
|
||||
for(auto itr : get_buffers().as_array())
|
||||
{
|
||||
if(itr.handle > 0)
|
||||
if(itr > null_buffer_id)
|
||||
{
|
||||
ROCP_INFO << "flushing buffer " << itr.handle;
|
||||
ROCPROFILER_CALL(rocprofiler_flush_buffer(itr), "buffer flush");
|
||||
@@ -419,6 +425,32 @@ collection_period_cntrl(std::promise<void>&& _promise, rocprofiler_context_id_t
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
record_execution_profile(rocprofiler_thread_id_t thr_id,
|
||||
rocprofiler_context_id_t ctx_id,
|
||||
rocprofiler_external_correlation_id_request_kind_t kind,
|
||||
rocprofiler_tracing_operation_t op,
|
||||
uint64_t /*internal_corr_id*/,
|
||||
rocprofiler_user_data_t* /*external_corr_id*/,
|
||||
void* /*user_data*/)
|
||||
{
|
||||
auto _record_data = [](tool::execution_profile_data& _data,
|
||||
rocprofiler_thread_id_t _thr_id,
|
||||
rocprofiler_context_id_t _ctx_id,
|
||||
rocprofiler_external_correlation_id_request_kind_t _kind,
|
||||
rocprofiler_tracing_operation_t _op) {
|
||||
_data.category_count[_kind] += 1;
|
||||
_data.category_op_count[_kind].emplace(_op);
|
||||
_data.threads.emplace(_thr_id);
|
||||
_data.contexts.emplace(_ctx_id);
|
||||
};
|
||||
|
||||
if(execution_profile)
|
||||
execution_profile->wlock(std::move(_record_data), thr_id, ctx_id, kind, op);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
set_kernel_rename_and_stream_display_correlation_id(
|
||||
rocprofiler_thread_id_t thr_id,
|
||||
@@ -623,6 +655,27 @@ runtime_initialization_callback(rocprofiler_callback_tracing_record_t record,
|
||||
common::consume_args(user_data, data);
|
||||
}
|
||||
|
||||
void
|
||||
dummy_callback_tracing_callback(rocprofiler_callback_tracing_record_t /*record*/,
|
||||
rocprofiler_user_data_t* /*user_data*/,
|
||||
void* /*data*/)
|
||||
{}
|
||||
|
||||
void
|
||||
dummy_counter_dispatch_callback(rocprofiler_dispatch_counting_service_data_t,
|
||||
rocprofiler_profile_config_id_t*,
|
||||
rocprofiler_user_data_t*,
|
||||
void*)
|
||||
{}
|
||||
|
||||
void
|
||||
dummy_counter_record_callback(rocprofiler_dispatch_counting_service_data_t,
|
||||
rocprofiler_record_counter_t*,
|
||||
size_t,
|
||||
rocprofiler_user_data_t,
|
||||
void*)
|
||||
{}
|
||||
|
||||
void
|
||||
callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
@@ -865,6 +918,10 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
auto* sym_data = static_cast<tool::rocprofiler_kernel_symbol_info_t*>(record.payload);
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
|
||||
{
|
||||
ROCP_TRACE << fmt::format("adding kernel symbol info for kernel_id={} :: {}",
|
||||
sym_data->kernel_id,
|
||||
sym_data->kernel_name);
|
||||
|
||||
auto success = CHECK_NOTNULL(tool_metadata)
|
||||
->add_kernel_symbol(kernel_symbol_info{
|
||||
get_dereference(sym_data),
|
||||
@@ -917,6 +974,15 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
(void) data;
|
||||
}
|
||||
|
||||
void
|
||||
dummy_buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
|
||||
rocprofiler_buffer_id_t /*buffer_id*/,
|
||||
rocprofiler_record_header_t** /*headers*/,
|
||||
size_t /*num_headers*/,
|
||||
void* /*user_data*/,
|
||||
uint64_t /*drop_count*/)
|
||||
{}
|
||||
|
||||
void
|
||||
buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
|
||||
rocprofiler_buffer_id_t /*buffer_id*/,
|
||||
@@ -1234,12 +1300,12 @@ get_att_perfcounter_params(rocprofiler_agent_id_t agen
|
||||
}
|
||||
|
||||
void
|
||||
rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/,
|
||||
rocprofiler_buffer_id_t /* buffer_id*/,
|
||||
rocprofiler_record_header_t** headers,
|
||||
size_t num_headers,
|
||||
void* /*data*/,
|
||||
uint64_t /* drop_count*/)
|
||||
pc_sampling_callback(rocprofiler_context_id_t /* context_id*/,
|
||||
rocprofiler_buffer_id_t /* buffer_id*/,
|
||||
rocprofiler_record_header_t** headers,
|
||||
size_t num_headers,
|
||||
void* /*data*/,
|
||||
uint64_t /* drop_count*/)
|
||||
{
|
||||
if(!headers) return;
|
||||
|
||||
@@ -1345,10 +1411,10 @@ att_dispatch_callback(rocprofiler_agent_id_t /* agent_id */,
|
||||
}
|
||||
|
||||
void
|
||||
dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data,
|
||||
rocprofiler_counter_config_id_t* config,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
void* /*callback_data_args*/)
|
||||
counter_dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data,
|
||||
rocprofiler_counter_config_id_t* config,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
void* /*callback_data_args*/)
|
||||
{
|
||||
static auto kernel_iteration = common::Synchronized<kernel_iteration_t, true>{};
|
||||
|
||||
@@ -1499,9 +1565,10 @@ if_pc_sample_config_match(rocprofiler_agent_id_t agent_id,
|
||||
}
|
||||
|
||||
void
|
||||
configure_pc_sampling_on_all_agents(uint64_t buffer_size,
|
||||
uint64_t buffer_watermark,
|
||||
void* tool_data)
|
||||
configure_pc_sampling_on_all_agents(uint64_t buffer_size,
|
||||
uint64_t buffer_watermark,
|
||||
void* tool_data,
|
||||
rocprofiler_buffer_tracing_cb_t pc_sampling_cb)
|
||||
{
|
||||
auto method = tool::get_config().pc_sampling_method_value;
|
||||
auto unit = tool::get_config().pc_sampling_unit_value;
|
||||
@@ -1515,7 +1582,7 @@ configure_pc_sampling_on_all_agents(uint64_t buffer_size,
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
rocprofiler_pc_sampling_callback,
|
||||
pc_sampling_cb,
|
||||
tool_data,
|
||||
buffer_id),
|
||||
"buffer creation");
|
||||
@@ -1544,9 +1611,81 @@ configure_pc_sampling_on_all_agents(uint64_t buffer_size,
|
||||
ROCP_FATAL << "Given PC sampling configuration is not supported on any of the agents";
|
||||
}
|
||||
|
||||
struct real_callbacks_t
|
||||
{};
|
||||
|
||||
struct dummy_callbacks_t
|
||||
{};
|
||||
|
||||
constexpr auto use_real_callbacks = real_callbacks_t{};
|
||||
constexpr auto use_dummy_callbacks = dummy_callbacks_t{};
|
||||
|
||||
struct tracing_callbacks_t
|
||||
{
|
||||
tracing_callbacks_t() = delete;
|
||||
|
||||
tracing_callbacks_t(real_callbacks_t)
|
||||
: code_object_tracing{code_object_tracing_callback}
|
||||
, cntrl_tracing{cntrl_tracing_callback}
|
||||
, kernel_rename{kernel_rename_callback}
|
||||
, hip_stream{hip_stream_display_callback}
|
||||
, callback_tracing{callback_tracing_callback}
|
||||
, buffered_tracing{buffered_tracing_callback}
|
||||
, pc_sampling{pc_sampling_callback}
|
||||
, att_dispatch{att_dispatch_callback}
|
||||
, att_shader_data{att_shader_data_callback}
|
||||
, counter_dispatch{counter_dispatch_callback}
|
||||
, counter_record{counter_record_callback}
|
||||
{}
|
||||
|
||||
explicit tracing_callbacks_t(dummy_callbacks_t)
|
||||
: code_object_tracing{dummy_callback_tracing_callback}
|
||||
, cntrl_tracing{dummy_callback_tracing_callback}
|
||||
, kernel_rename{dummy_callback_tracing_callback}
|
||||
, hip_stream{dummy_callback_tracing_callback}
|
||||
, callback_tracing{dummy_callback_tracing_callback}
|
||||
, buffered_tracing{dummy_buffered_tracing_callback}
|
||||
, pc_sampling{dummy_buffered_tracing_callback}
|
||||
, counter_dispatch{dummy_counter_dispatch_callback}
|
||||
, counter_record{dummy_counter_record_callback}
|
||||
{}
|
||||
|
||||
const rocprofiler_callback_tracing_cb_t code_object_tracing = nullptr;
|
||||
const rocprofiler_callback_tracing_cb_t cntrl_tracing = nullptr;
|
||||
const rocprofiler_callback_tracing_cb_t kernel_rename = nullptr;
|
||||
const rocprofiler_callback_tracing_cb_t hip_stream = nullptr;
|
||||
const rocprofiler_callback_tracing_cb_t callback_tracing = nullptr;
|
||||
const rocprofiler_buffer_tracing_cb_t buffered_tracing = nullptr;
|
||||
const rocprofiler_buffer_tracing_cb_t pc_sampling = nullptr;
|
||||
const rocprofiler_att_dispatch_callback_t att_dispatch = nullptr;
|
||||
const rocprofiler_att_shader_data_callback_t att_shader_data = nullptr;
|
||||
const rocprofiler_dispatch_counting_service_cb_t counter_dispatch = nullptr;
|
||||
const rocprofiler_dispatch_counting_record_cb_t counter_record = nullptr;
|
||||
};
|
||||
|
||||
auto
|
||||
get_tracing_callbacks()
|
||||
{
|
||||
// for the benchmarking modes of sdk buffer/callback overhead, we are measuring the cost
|
||||
// of the SDK invoking the callbacks to the tool. We do not want to include the overhead
|
||||
// of the tool doing any work so we use "dummy" callbacks (i.e. functions which just
|
||||
// immediately return)
|
||||
if(tool::get_config().benchmark_mode == tool::config::benchmark::sdk_buffered_overhead ||
|
||||
tool::get_config().benchmark_mode == tool::config::benchmark::sdk_callback_overhead ||
|
||||
tool::get_config().benchmark_mode == tool::config::benchmark::execution_profile)
|
||||
{
|
||||
return tracing_callbacks_t{use_dummy_callbacks};
|
||||
}
|
||||
|
||||
return tracing_callbacks_t{use_real_callbacks};
|
||||
}
|
||||
|
||||
int
|
||||
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
{
|
||||
static constexpr auto null_context_id = rocprofiler_context_id_t{.handle = 0};
|
||||
static constexpr auto null_buffer_id = rocprofiler_buffer_id_t{.handle = 0};
|
||||
|
||||
client_finalizer = fini_func;
|
||||
|
||||
const uint64_t buffer_size = 16 * common::units::get_page_size();
|
||||
@@ -1556,18 +1695,39 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_create_context(&get_client_ctx()), "create context failed");
|
||||
|
||||
auto code_obj_ctx = rocprofiler_context_id_t{0};
|
||||
auto code_obj_ctx = null_context_id;
|
||||
ROCPROFILER_CALL(rocprofiler_create_context(&code_obj_ctx), "failed to create context");
|
||||
|
||||
auto start_context = [](rocprofiler_context_id_t ctx_id, std::string_view msg) {
|
||||
using benchmark = tool::config::benchmark;
|
||||
// do not start context if we are benchmarking the overhead of a service
|
||||
// being available but unused by any contexts
|
||||
if(tool::get_config().benchmark_mode != benchmark::disabled_contexts_overhead &&
|
||||
ctx_id != null_context_id)
|
||||
{
|
||||
if(tool::get_config().benchmark_mode == benchmark::execution_profile)
|
||||
{
|
||||
ROCPROFILER_CHECK(rocprofiler_configure_external_correlation_id_request_service(
|
||||
ctx_id, nullptr, 0, record_execution_profile, nullptr));
|
||||
}
|
||||
|
||||
ROCP_INFO << fmt::format("starting {} context...", msg);
|
||||
ROCPROFILER_CHECK(rocprofiler_start_context(ctx_id));
|
||||
}
|
||||
};
|
||||
|
||||
auto callbacks = get_tracing_callbacks();
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_callback_tracing_service(code_obj_ctx,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
|
||||
nullptr,
|
||||
0,
|
||||
code_object_tracing_callback,
|
||||
callbacks.code_object_tracing,
|
||||
nullptr),
|
||||
"code object tracing configure failed");
|
||||
ROCPROFILER_CALL(rocprofiler_start_context(code_obj_ctx), "start context failed");
|
||||
|
||||
start_context(code_obj_ctx, "code object");
|
||||
|
||||
if(tool::get_config().marker_api_trace)
|
||||
{
|
||||
@@ -1576,11 +1736,11 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
|
||||
nullptr,
|
||||
0,
|
||||
callback_tracing_callback,
|
||||
callbacks.callback_tracing,
|
||||
nullptr),
|
||||
"callback tracing service failed to configure");
|
||||
|
||||
auto pause_resume_ctx = rocprofiler_context_id_t{0};
|
||||
auto pause_resume_ctx = null_context_id;
|
||||
ROCPROFILER_CALL(rocprofiler_create_context(&pause_resume_ctx), "failed to create context");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service(
|
||||
@@ -1588,122 +1748,154 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API,
|
||||
nullptr,
|
||||
0,
|
||||
cntrl_tracing_callback,
|
||||
callbacks.cntrl_tracing,
|
||||
static_cast<void*>(&get_client_ctx())),
|
||||
"callback tracing service failed to configure");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_start_context(pause_resume_ctx), "start context failed");
|
||||
start_context(pause_resume_ctx, "marker pause/resume");
|
||||
}
|
||||
|
||||
if(tool::get_config().kernel_trace)
|
||||
struct buffer_service_config
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
tool_data,
|
||||
&get_buffers().kernel_trace),
|
||||
"buffer creation");
|
||||
bool option = false;
|
||||
rocprofiler_buffer_tracing_kind_t kind = ROCPROFILER_BUFFER_TRACING_NONE;
|
||||
rocprofiler_buffer_id_t& buffer_id;
|
||||
};
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().kernel_trace),
|
||||
"buffer tracing service for kernel dispatch configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().memory_copy_trace)
|
||||
for(auto&& itr : {buffer_service_config{tool::get_config().kernel_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH,
|
||||
get_buffers().kernel_trace},
|
||||
buffer_service_config{tool::get_config().memory_copy_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
|
||||
get_buffers().memory_copy_trace},
|
||||
buffer_service_config{tool::get_config().scratch_memory_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
get_buffers().scratch_memory},
|
||||
buffer_service_config{tool::get_config().hsa_core_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_CORE_API,
|
||||
get_buffers().hsa_api_trace},
|
||||
buffer_service_config{tool::get_config().hsa_amd_ext_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API,
|
||||
get_buffers().hsa_api_trace},
|
||||
buffer_service_config{tool::get_config().hsa_image_ext_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API,
|
||||
get_buffers().hsa_api_trace},
|
||||
buffer_service_config{tool::get_config().hsa_finalizer_ext_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API,
|
||||
get_buffers().hsa_api_trace},
|
||||
buffer_service_config{tool::get_config().hip_runtime_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT,
|
||||
get_buffers().hip_api_trace},
|
||||
buffer_service_config{tool::get_config().hip_compiler_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT,
|
||||
get_buffers().hip_api_trace},
|
||||
buffer_service_config{tool::get_config().rccl_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_RCCL_API,
|
||||
get_buffers().rccl_api_trace},
|
||||
buffer_service_config{tool::get_config().memory_allocation_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
|
||||
get_buffers().memory_allocation_trace},
|
||||
buffer_service_config{tool::get_config().rocdecode_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_ROCDECODE_API_EXT,
|
||||
get_buffers().rocdecode_api_trace},
|
||||
buffer_service_config{tool::get_config().rocjpeg_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_ROCJPEG_API,
|
||||
get_buffers().rocjpeg_api_trace}})
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
nullptr,
|
||||
&get_buffers().memory_copy_trace),
|
||||
"create memory copy buffer");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().memory_copy_trace),
|
||||
"buffer tracing service for memory copy configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().memory_allocation_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
nullptr,
|
||||
&get_buffers().memory_allocation_trace),
|
||||
"create memory allocation buffer");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().memory_allocation_trace),
|
||||
"buffer tracing service for memory allocation configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().scratch_memory_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
tool_data,
|
||||
&get_buffers().scratch_memory),
|
||||
"buffer creation");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().scratch_memory),
|
||||
"buffer tracing service for scratch memory configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().hsa_core_api_trace || tool::get_config().hsa_amd_ext_api_trace ||
|
||||
tool::get_config().hsa_image_ext_api_trace || tool::get_config().hsa_finalizer_ext_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
tool_data,
|
||||
&get_buffers().hsa_api_trace),
|
||||
"buffer creation");
|
||||
|
||||
using optpair_t = std::pair<bool, rocprofiler_buffer_tracing_kind_t>;
|
||||
for(auto itr : {optpair_t{tool::get_config().hsa_core_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_CORE_API},
|
||||
optpair_t{tool::get_config().hsa_amd_ext_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API},
|
||||
optpair_t{tool::get_config().hsa_image_ext_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API},
|
||||
optpair_t{tool::get_config().hsa_finalizer_ext_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API}})
|
||||
if(itr.option)
|
||||
{
|
||||
if(itr.first)
|
||||
// in sdk callback overhead benchmarking, we don't want to use the buffer services
|
||||
if(tool::get_config().benchmark_mode == tool::config::benchmark::sdk_callback_overhead)
|
||||
continue;
|
||||
|
||||
if(itr.buffer_id == null_buffer_id)
|
||||
{
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(), itr.second, nullptr, 0, get_buffers().hsa_api_trace),
|
||||
"buffer tracing service for hsa api configure");
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
callbacks.buffered_tracing,
|
||||
tool_data,
|
||||
&itr.buffer_id),
|
||||
"buffer creation");
|
||||
|
||||
ROCP_FATAL_IF(itr.buffer_id.handle == 0) << "failed to create buffer";
|
||||
|
||||
auto cb_thread = rocprofiler_callback_thread_t{};
|
||||
|
||||
ROCP_INFO << "creating dedicated callback thread for buffer "
|
||||
<< itr.buffer_id.handle;
|
||||
ROCPROFILER_CALL(rocprofiler_create_callback_thread(&cb_thread),
|
||||
"creating callback thread");
|
||||
|
||||
ROCP_INFO << "assigning buffer " << itr.buffer_id.handle << " to callback thread "
|
||||
<< cb_thread.handle;
|
||||
ROCPROFILER_CALL(rocprofiler_assign_callback_thread(itr.buffer_id, cb_thread),
|
||||
"assigning callback thread");
|
||||
}
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(), itr.kind, nullptr, 0, itr.buffer_id),
|
||||
"buffer tracing service configure");
|
||||
}
|
||||
}
|
||||
|
||||
struct callback_service_config
|
||||
{
|
||||
bool option = false;
|
||||
rocprofiler_callback_tracing_kind_t kind = ROCPROFILER_CALLBACK_TRACING_NONE;
|
||||
rocprofiler_callback_tracing_cb_t callback = nullptr;
|
||||
};
|
||||
|
||||
for(auto&& itr : {callback_service_config{tool::get_config().kernel_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().memory_copy_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().scratch_memory_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().hsa_core_api_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().hsa_amd_ext_api_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().hsa_image_ext_api_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().hsa_finalizer_ext_api_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().hip_runtime_api_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().hip_compiler_api_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().rccl_api_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_RCCL_API,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().memory_allocation_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().rocdecode_api_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API,
|
||||
dummy_callback_tracing_callback},
|
||||
callback_service_config{tool::get_config().rocjpeg_api_trace,
|
||||
ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API,
|
||||
dummy_callback_tracing_callback}})
|
||||
{
|
||||
if(itr.option)
|
||||
{
|
||||
// in sdk callback overhead benchmarking, we don't want to use the buffer services
|
||||
if(tool::get_config().benchmark_mode != tool::config::benchmark::sdk_callback_overhead)
|
||||
continue;
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service(
|
||||
get_client_ctx(), itr.kind, nullptr, 0, itr.callback, nullptr),
|
||||
"callback tracing service failed to configure");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1748,120 +1940,28 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
id,
|
||||
agent_params.data(),
|
||||
agent_params.size(),
|
||||
att_dispatch_callback,
|
||||
att_shader_data_callback,
|
||||
callbacks.att_dispatch,
|
||||
callbacks.att_shader_data,
|
||||
tool_data),
|
||||
"thread trace service configure");
|
||||
}
|
||||
}
|
||||
|
||||
if(tool::get_config().hip_runtime_api_trace || tool::get_config().hip_compiler_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
tool_data,
|
||||
&get_buffers().hip_api_trace),
|
||||
"buffer creation");
|
||||
|
||||
if(tool::get_config().hip_runtime_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().hip_api_trace),
|
||||
"buffer tracing service for hip api configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().hip_compiler_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().hip_api_trace),
|
||||
"buffer tracing service for hip compiler api configure");
|
||||
}
|
||||
}
|
||||
|
||||
if(tool::get_config().rccl_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
tool_data,
|
||||
&get_buffers().rccl_api_trace),
|
||||
"buffer creation");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_RCCL_API,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().rccl_api_trace),
|
||||
"buffer tracing service for rccl api configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().counter_collection)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_context(&counter_collection_ctx),
|
||||
"failed to create context");
|
||||
"failed to create counter collection context");
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_callback_dispatch_counting_service(counter_collection_ctx,
|
||||
dispatch_callback,
|
||||
callbacks.counter_dispatch,
|
||||
nullptr,
|
||||
counter_record_callback,
|
||||
callbacks.counter_record,
|
||||
nullptr),
|
||||
"Could not setup counting service");
|
||||
ROCPROFILER_CALL(rocprofiler_start_context(counter_collection_ctx), "start context failed");
|
||||
|
||||
start_context(counter_collection_ctx, "counter collection");
|
||||
}
|
||||
|
||||
if(tool::get_config().rocdecode_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
tool_data,
|
||||
&get_buffers().rocdecode_api_trace),
|
||||
"buffer creation");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_ROCDECODE_API_EXT,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().rocdecode_api_trace),
|
||||
"buffer tracing service for ROCDecode api configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().rocjpeg_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
tool_data,
|
||||
&get_buffers().rocjpeg_api_trace),
|
||||
"buffer creation");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_ROCJPEG_API,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().rocjpeg_api_trace),
|
||||
"buffer tracing service for ROCDecode api configure");
|
||||
}
|
||||
if(tool::get_config().kernel_rename)
|
||||
{
|
||||
auto rename_ctx = rocprofiler_context_id_t{0};
|
||||
@@ -1877,34 +1977,38 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
|
||||
marker_core_api_kinds.data(),
|
||||
marker_core_api_kinds.size(),
|
||||
kernel_rename_callback,
|
||||
callbacks.kernel_rename,
|
||||
nullptr),
|
||||
"callback tracing service failed to configure");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_start_context(rename_ctx), "start context failed");
|
||||
start_context(rename_ctx, "kernel rename");
|
||||
}
|
||||
|
||||
if(!tool::get_config().group_by_queue)
|
||||
{
|
||||
// Track stream ID information via callback service
|
||||
auto hip_stream_display_ctx = rocprofiler_context_id_t{0};
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_create_context(&hip_stream_display_ctx),
|
||||
"failed to create context");
|
||||
"failed to create hip stream context");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_callback_tracing_service(hip_stream_display_ctx,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_STREAM,
|
||||
nullptr,
|
||||
0,
|
||||
hip_stream_display_callback,
|
||||
callbacks.hip_stream,
|
||||
nullptr),
|
||||
"stream tracing configure failed");
|
||||
ROCPROFILER_CALL(rocprofiler_start_context(hip_stream_display_ctx), "start context failed");
|
||||
"hip stream tracing configure failed");
|
||||
|
||||
start_context(hip_stream_display_ctx, "hip stream");
|
||||
|
||||
// Track if HIP runtime has been initialized via runtime_intialization service
|
||||
auto runtime_initialization_ctx = rocprofiler_context_id_t{0};
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_create_context(&runtime_initialization_ctx),
|
||||
"failed to create context");
|
||||
"failed to create runtime initialization context");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service(
|
||||
runtime_initialization_ctx,
|
||||
ROCPROFILER_CALLBACK_TRACING_RUNTIME_INITIALIZATION,
|
||||
@@ -1912,11 +2016,13 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
0,
|
||||
runtime_initialization_callback,
|
||||
nullptr),
|
||||
"stream tracing configure failed");
|
||||
ROCPROFILER_CALL(rocprofiler_start_context(runtime_initialization_ctx),
|
||||
"start context failed");
|
||||
"runtime initialization tracing configure failed");
|
||||
|
||||
start_context(runtime_initialization_ctx, "runtime initialization");
|
||||
}
|
||||
if(tool::get_config().kernel_rename || !tool::get_config().group_by_queue)
|
||||
|
||||
if((tool::get_config().kernel_rename || !tool::get_config().group_by_queue) &&
|
||||
tool::get_config().benchmark_mode != tool::config::benchmark::execution_profile)
|
||||
{
|
||||
auto external_corr_id_request_kinds =
|
||||
std::array<rocprofiler_external_correlation_id_request_kind_t, 2>{
|
||||
@@ -1949,16 +2055,18 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
|
||||
if(tool::get_config().pc_sampling_host_trap)
|
||||
{
|
||||
configure_pc_sampling_on_all_agents(buffer_size, buffer_watermark, tool_data);
|
||||
configure_pc_sampling_on_all_agents(
|
||||
buffer_size, buffer_watermark, tool_data, callbacks.pc_sampling);
|
||||
}
|
||||
else if(tool::get_config().pc_sampling_stochastic)
|
||||
{
|
||||
configure_pc_sampling_on_all_agents(buffer_size, buffer_watermark, tool_data);
|
||||
configure_pc_sampling_on_all_agents(
|
||||
buffer_size, buffer_watermark, tool_data, callbacks.pc_sampling);
|
||||
}
|
||||
|
||||
for(auto itr : get_buffers().as_array())
|
||||
for(auto itr : get_buffers().pc_sampling_buffers_as_array())
|
||||
{
|
||||
if(itr.handle > 0)
|
||||
if(itr > null_buffer_id)
|
||||
{
|
||||
auto cb_thread = rocprofiler_callback_thread_t{};
|
||||
|
||||
@@ -1973,24 +2081,38 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
}
|
||||
}
|
||||
|
||||
if(tool::get_config().collection_periods.empty())
|
||||
{
|
||||
ROCPROFILER_CHECK(rocprofiler_start_context(get_client_ctx()));
|
||||
}
|
||||
else
|
||||
{
|
||||
auto _prom = std::promise<void>{};
|
||||
auto _fut = _prom.get_future();
|
||||
std::thread{collection_period_cntrl, std::move(_prom), get_client_ctx()}.detach();
|
||||
_fut.wait_for(std::chrono::seconds{1}); // wait for a max of 1 second
|
||||
}
|
||||
|
||||
// Handle kernel id of zero
|
||||
bool include = std::regex_search("0", std::regex(tool::get_config().kernel_filter_include));
|
||||
bool exclude = std::regex_search("0", std::regex(tool::get_config().kernel_filter_exclude));
|
||||
if(include && (!exclude || tool::get_config().kernel_filter_exclude.empty()))
|
||||
add_kernel_target(0, tool::get_config().kernel_filter_range);
|
||||
|
||||
if(tool::get_config().benchmark_mode == tool::config::benchmark::disabled_contexts_overhead)
|
||||
{
|
||||
ROCP_INFO << "rocprofv3 is not recording data because the overhead of inactive contexts is "
|
||||
"being benchmarked";
|
||||
}
|
||||
else if(tool::get_config().selected_regions)
|
||||
{
|
||||
ROCP_WARNING << "rocprofv3 is only recording profiling data within regions of code "
|
||||
"surrounded by roctxProfilerResume(0)/roctxProfilerPause";
|
||||
}
|
||||
else if(!tool::get_config().collection_periods.empty())
|
||||
{
|
||||
ROCP_INFO << "rocprofv3 will record data during the defined collection period(s)";
|
||||
|
||||
auto _prom = std::promise<void>{};
|
||||
auto _fut = _prom.get_future();
|
||||
std::thread{collection_period_cntrl, std::move(_prom), get_client_ctx()}.detach();
|
||||
_fut.wait_for(std::chrono::seconds{1}); // wait for a max of 1 second
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCP_INFO << "rocprofv3 will record data starting now";
|
||||
|
||||
start_context(get_client_ctx(), "primary rocprofv3");
|
||||
}
|
||||
|
||||
tool_metadata->process_id = getpid();
|
||||
rocprofiler_get_timestamp(&(tool_metadata->process_start_ns));
|
||||
|
||||
@@ -2008,6 +2130,90 @@ struct output_data
|
||||
uint64_t num_bytes = 0;
|
||||
};
|
||||
|
||||
void
|
||||
generate_config_output(const tool::config& cfg, const tool::metadata& tool_metadata_v)
|
||||
{
|
||||
using JSONOutputArchive = ::cereal::PrettyJSONOutputArchive;
|
||||
|
||||
constexpr auto json_prec = 16;
|
||||
constexpr auto json_indent = JSONOutputArchive::Options::IndentChar::space;
|
||||
auto json_opts = JSONOutputArchive::Options{json_prec, json_indent, 2};
|
||||
auto filename = std::string_view{"config"};
|
||||
|
||||
auto stream = get_output_stream(cfg, filename, ".json");
|
||||
{
|
||||
auto archive = JSONOutputArchive{*stream.stream, json_opts};
|
||||
|
||||
archive.setNextName("rocprofiler-sdk-tool");
|
||||
archive.startNode();
|
||||
archive.makeArray();
|
||||
archive.startNode(); // first array entry
|
||||
|
||||
auto timestamps =
|
||||
tool::timestamps_t{tool_metadata_v.process_start_ns, tool_metadata_v.process_end_ns};
|
||||
|
||||
auto this_pid = tool_metadata_v.process_id;
|
||||
|
||||
archive.setNextName("metadata");
|
||||
archive.startNode();
|
||||
archive(cereal::make_nvp("pid", this_pid));
|
||||
archive(cereal::make_nvp("init_time", timestamps.app_start_time));
|
||||
archive(cereal::make_nvp("fini_time", timestamps.app_end_time));
|
||||
archive(cereal::make_nvp("config", cfg));
|
||||
archive(cereal::make_nvp("command", common::read_command_line(this_pid)));
|
||||
|
||||
{
|
||||
archive.setNextName("build_spec");
|
||||
archive.startNode();
|
||||
archive(cereal::make_nvp("version_major", ROCPROFILER_VERSION_MAJOR));
|
||||
archive(cereal::make_nvp("version_minor", ROCPROFILER_VERSION_MINOR));
|
||||
archive(cereal::make_nvp("version_patch", ROCPROFILER_VERSION_PATCH));
|
||||
archive(cereal::make_nvp("soversion", ROCPROFILER_SOVERSION));
|
||||
archive(cereal::make_nvp("compiler_id", std::string{ROCPROFILER_COMPILER_ID}));
|
||||
archive(
|
||||
cereal::make_nvp("compiler_version", std::string{ROCPROFILER_COMPILER_VERSION}));
|
||||
archive(cereal::make_nvp("git_describe", std::string{ROCPROFILER_GIT_DESCRIBE}));
|
||||
archive(cereal::make_nvp("git_revision", std::string{ROCPROFILER_GIT_REVISION}));
|
||||
archive(cereal::make_nvp("library_arch", std::string{ROCPROFILER_LIBRARY_ARCH}));
|
||||
archive(cereal::make_nvp("system_name", std::string{ROCPROFILER_SYSTEM_NAME}));
|
||||
archive(
|
||||
cereal::make_nvp("system_processor", std::string{ROCPROFILER_SYSTEM_PROCESSOR}));
|
||||
archive(cereal::make_nvp("system_version", std::string{ROCPROFILER_SYSTEM_VERSION}));
|
||||
archive.finishNode(); // build_spec
|
||||
}
|
||||
|
||||
// save the execution profile
|
||||
if(execution_profile) archive(cereal::make_nvp("profile", execution_profile->get()));
|
||||
|
||||
// save the environment variables
|
||||
{
|
||||
archive.setNextName("environment");
|
||||
archive.startNode();
|
||||
size_t idx = 0;
|
||||
while(true)
|
||||
{
|
||||
const auto* env_entry = environ[idx++];
|
||||
if(!env_entry)
|
||||
break;
|
||||
else if(std::string_view{env_entry}.find('=') != std::string_view::npos)
|
||||
{
|
||||
auto _entry = std::string{env_entry};
|
||||
auto _pos = _entry.find('=');
|
||||
auto _name = _entry.substr(0, _pos);
|
||||
auto _value = _entry.substr(_pos + 1);
|
||||
archive(cereal::make_nvp(_name.c_str(), _value));
|
||||
}
|
||||
}
|
||||
archive.finishNode();
|
||||
}
|
||||
|
||||
archive.finishNode(); // metadata
|
||||
archive.finishNode(); // first array entry
|
||||
archive.finishNode(); // rocprofiler-sdk-tool
|
||||
}
|
||||
stream.close();
|
||||
}
|
||||
|
||||
template <typename Tp, domain_type DomainT>
|
||||
void
|
||||
generate_output(tool::buffered_output<Tp, DomainT>& output_v,
|
||||
@@ -2019,6 +2225,9 @@ generate_output(tool::buffered_output<Tp, DomainT>& output_v,
|
||||
|
||||
if(!output_v) return;
|
||||
|
||||
// when benchmarking, we do not generate output
|
||||
if(tool::get_config().benchmark_mode != tool::config::benchmark::none) return;
|
||||
|
||||
// opens temporary file and sets read position to beginning
|
||||
output_v.read();
|
||||
|
||||
@@ -2104,6 +2313,12 @@ tool_fini(void* /*tool_data*/)
|
||||
cleanups.clear();
|
||||
};
|
||||
|
||||
// generate the configuration output regardless of whether there is any data
|
||||
if(tool::get_config().output_config_file)
|
||||
{
|
||||
generate_config_output(tool::get_config(), *tool_metadata);
|
||||
}
|
||||
|
||||
auto _dtor = common::scope_destructor{run_cleanup};
|
||||
|
||||
generate_output(kernel_dispatch_output, outdata, contributions, cleanups);
|
||||
@@ -2662,6 +2877,7 @@ rocprofiler_configure(uint32_t version,
|
||||
|
||||
// ensure these pointers are not leaked
|
||||
add_destructor(tool_metadata);
|
||||
add_destructor(execution_profile);
|
||||
|
||||
// in case main wrapper is not used
|
||||
::atexit([]() { finalize_rocprofv3("atexit"); });
|
||||
|
||||
@@ -20,13 +20,13 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <rocprofiler-sdk/external_correlation.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
|
||||
#include "lib/rocprofiler-sdk/external_correlation.hpp"
|
||||
#include "lib/common/synchronized.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/external_correlation.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/external_correlation.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
@@ -36,6 +36,50 @@ namespace external_correlation
|
||||
{
|
||||
namespace
|
||||
{
|
||||
#define ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(CODE) \
|
||||
template <> \
|
||||
struct external_correlation_id_request_kind_string< \
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_##CODE> \
|
||||
{ \
|
||||
static constexpr auto value = \
|
||||
std::pair<const char*, size_t>{#CODE, std::string_view{#CODE}.length()}; \
|
||||
};
|
||||
|
||||
template <size_t Idx>
|
||||
struct external_correlation_id_request_kind_string;
|
||||
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(NONE)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HSA_CORE_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HSA_AMD_EXT_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HSA_IMAGE_EXT_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HSA_FINALIZE_EXT_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HIP_RUNTIME_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(HIP_COMPILER_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MARKER_CORE_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MARKER_CONTROL_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MARKER_NAME_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MEMORY_COPY)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(MEMORY_ALLOCATION)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(KERNEL_DISPATCH)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(SCRATCH_MEMORY)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(RCCL_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(OMPT)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(ROCDECODE_API)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING(ROCJPEG_API)
|
||||
|
||||
#undef ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KIND_STRING
|
||||
|
||||
template <size_t Idx, size_t... Tail>
|
||||
std::pair<const char*, size_t>
|
||||
get_kind_name(rocprofiler_external_correlation_id_request_kind_t kind,
|
||||
std::index_sequence<Idx, Tail...>)
|
||||
{
|
||||
if(kind == Idx) return external_correlation_id_request_kind_string<Idx>::value;
|
||||
// recursion until tail empty
|
||||
if constexpr(sizeof...(Tail) > 0) return get_kind_name(kind, std::index_sequence<Tail...>{});
|
||||
return {nullptr, 0};
|
||||
}
|
||||
|
||||
auto
|
||||
get_default_tid()
|
||||
{
|
||||
@@ -249,6 +293,21 @@ rocprofiler_configure_external_correlation_id_request_service(
|
||||
callback, callback_args, kinds_v);
|
||||
}
|
||||
|
||||
rocprofiler_status_t
|
||||
rocprofiler_query_external_correlation_id_request_kind_name(
|
||||
rocprofiler_external_correlation_id_request_kind_t kind,
|
||||
const char** name,
|
||||
uint64_t* name_len)
|
||||
{
|
||||
auto&& val = rocprofiler::external_correlation::get_kind_name(
|
||||
kind, std::make_index_sequence<ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_LAST>{});
|
||||
|
||||
if(name) *name = val.first;
|
||||
if(name_len) *name_len = val.second;
|
||||
|
||||
return (val.first) ? ROCPROFILER_STATUS_SUCCESS : ROCPROFILER_STATUS_ERROR_KIND_NOT_FOUND;
|
||||
}
|
||||
|
||||
rocprofiler_status_t
|
||||
rocprofiler_push_external_correlation_id(rocprofiler_context_id_t context,
|
||||
rocprofiler_thread_id_t tid,
|
||||
|
||||
@@ -42,6 +42,7 @@ namespace context
|
||||
{
|
||||
struct context;
|
||||
}
|
||||
|
||||
namespace external_correlation
|
||||
{
|
||||
static constexpr bool enable_const_wlock_v = true;
|
||||
|
||||
@@ -688,6 +688,33 @@ initialize()
|
||||
if(get_num_clients() > 0) internal_threading::initialize();
|
||||
// initialization is no longer available
|
||||
set_init_status(1);
|
||||
|
||||
if(get_num_clients() > 0)
|
||||
{
|
||||
for(const auto& itr : *get_clients())
|
||||
{
|
||||
if(!itr) continue;
|
||||
size_t _client_registered_ctx = 0;
|
||||
for(const auto* citr : context::get_registered_contexts())
|
||||
{
|
||||
if(citr->client_idx == itr->internal_client_id.handle) ++_client_registered_ctx;
|
||||
}
|
||||
|
||||
size_t _client_activated_ctx = 0;
|
||||
for(const auto* citr : context::get_active_contexts())
|
||||
{
|
||||
if(citr->client_idx == itr->internal_client_id.handle) ++_client_activated_ctx;
|
||||
}
|
||||
|
||||
ROCP_INFO << fmt::format("rocprofiler-sdk client '{}' registered {} context(s) and "
|
||||
"started {} context(s)",
|
||||
(itr->mutable_client_id.name)
|
||||
? std::string_view{itr->mutable_client_id.name}
|
||||
: std::string_view{"unspecified"},
|
||||
_client_registered_ctx,
|
||||
_client_activated_ctx);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ find_package(rocJPEG)
|
||||
|
||||
# applications used by integration tests which DO link to rocprofiler-sdk-roctx
|
||||
add_subdirectory(reproducible-runtime)
|
||||
add_subdirectory(reproducible-dispatch-count)
|
||||
add_subdirectory(transpose)
|
||||
add_subdirectory(openmp)
|
||||
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(NOT CMAKE_HIP_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-sdk-tests-bin-reproducible-dispatch-count LANGUAGES CXX HIP)
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE MATCHES "(Release|RelWithDebInfo)")
|
||||
set(CMAKE_BUILD_TYPE "RelWithDebInfo")
|
||||
endif()
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
option(REPRODUCIBLE_DISPATCH_COUNT_USE_MPI
|
||||
"Enable MPI support in reproducible-dispatch-count exe" OFF)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_HIP_STANDARD 17)
|
||||
set(CMAKE_HIP_EXTENSIONS OFF)
|
||||
set(CMAKE_HIP_STANDARD_REQUIRED ON)
|
||||
|
||||
set_source_files_properties(reproducible-dispatch-count.cpp PROPERTIES LANGUAGE HIP)
|
||||
add_executable(reproducible-dispatch-count)
|
||||
target_sources(reproducible-dispatch-count PRIVATE reproducible-dispatch-count.cpp)
|
||||
target_compile_options(reproducible-dispatch-count PRIVATE -W -Wall -Wextra -Wpedantic
|
||||
-Wshadow -Werror)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
target_link_libraries(reproducible-dispatch-count PRIVATE Threads::Threads)
|
||||
|
||||
find_package(rocprofiler-sdk-roctx REQUIRED)
|
||||
target_link_libraries(reproducible-dispatch-count
|
||||
PRIVATE rocprofiler-sdk-roctx::rocprofiler-sdk-roctx)
|
||||
|
||||
if(REPRODUCIBLE_DISPATCH_COUNT_USE_MPI)
|
||||
find_package(MPI REQUIRED)
|
||||
target_compile_definitions(reproducible-dispatch-count PRIVATE USE_MPI)
|
||||
target_link_libraries(reproducible-dispatch-count PRIVATE MPI::MPI_C)
|
||||
endif()
|
||||
+254
@@ -0,0 +1,254 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "rocprofiler-sdk-roctx/roctx.h"
|
||||
|
||||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
|
||||
#if defined(USE_MPI)
|
||||
# include <mpi.h>
|
||||
#endif
|
||||
|
||||
#define HIP_API_CALL(CALL) \
|
||||
{ \
|
||||
hipError_t error_ = (CALL); \
|
||||
if(error_ != hipSuccess) \
|
||||
{ \
|
||||
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
|
||||
fprintf(stderr, \
|
||||
"%s:%d :: HIP error %i : %s\n", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
static_cast<int>(error_), \
|
||||
hipGetErrorString(error_)); \
|
||||
throw std::runtime_error("hip_api_call"); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
using auto_lock_t = std::unique_lock<std::mutex>;
|
||||
auto print_lock = std::mutex{};
|
||||
size_t niterations = 1000;
|
||||
uint32_t nspin = 4 * 10000;
|
||||
size_t nsync = 1;
|
||||
size_t nthreads = 2;
|
||||
|
||||
void
|
||||
check_hip_error(void);
|
||||
} // namespace
|
||||
|
||||
__global__ void
|
||||
reproducible_dispatch_count(uint32_t nspin);
|
||||
|
||||
void
|
||||
run(int tid, int devid);
|
||||
|
||||
void
|
||||
run_nsync(int tid, int devid);
|
||||
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
for(int i = 1; i < argc; ++i)
|
||||
{
|
||||
auto _arg = std::string{argv[i]};
|
||||
if(_arg == "?" || _arg == "-h" || _arg == "--help")
|
||||
{
|
||||
fprintf(stderr,
|
||||
"usage: reproducible-dispatch-count [KERNEL ITERATIONS PER THREAD (default: "
|
||||
"%zu msec)] [NUM_THREADS (default: %zu)] [SPIN CYCLES PER KERNEL LAUNCH "
|
||||
"(default: %u)] [ITERATION PER SYNC (default: %zu)\n",
|
||||
niterations,
|
||||
nthreads,
|
||||
nspin,
|
||||
nsync);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
if(argc > 1) niterations = std::stoll(argv[1]);
|
||||
if(argc > 2) nthreads = std::stoll(argv[2]);
|
||||
if(argc > 3) nspin = std::stoll(argv[3]);
|
||||
if(argc > 4) nsync = std::stoll(argv[4]);
|
||||
|
||||
printf("[reproducible-dispatch-count] Kernel dispatches per thread: %zu\n", niterations);
|
||||
printf("[reproducible-dispatch-count] Spin time per kernel: %u cycles\n", nspin);
|
||||
printf("[reproducible-dispatch-count] Number of threads: %zu\n", nthreads);
|
||||
printf("[reproducible-dispatch-count] Iterations per sync: %zu\n", nsync);
|
||||
|
||||
// this is a temporary workaround in omnitrace when HIP + MPI is enabled
|
||||
int ndevice = 0;
|
||||
HIP_API_CALL(hipGetDeviceCount(&ndevice));
|
||||
printf("[reproducible-dispatch-count] Number of devices found: %i\n", ndevice);
|
||||
auto _threads = std::vector<std::thread>{};
|
||||
for(size_t i = 0; i < nthreads; ++i)
|
||||
{
|
||||
if(nsync <= 1)
|
||||
_threads.emplace_back(run, i, i % ndevice);
|
||||
else
|
||||
_threads.emplace_back(run_nsync, i, i % ndevice);
|
||||
}
|
||||
for(auto& itr : _threads)
|
||||
itr.join();
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
HIP_API_CALL(hipDeviceReset());
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
__global__ void
|
||||
reproducible_dispatch_count(uint32_t nspin_v)
|
||||
{
|
||||
for(uint32_t i = 0; i < nspin_v / 64; i++)
|
||||
asm volatile("s_sleep 1");
|
||||
if(nspin_v > 64)
|
||||
for(uint32_t i = 0; i < nspin_v % 64; i++)
|
||||
asm volatile("s_sleep 1");
|
||||
}
|
||||
|
||||
void
|
||||
run(int tid, int devid)
|
||||
{
|
||||
auto roctx_range_id = roctxRangeStart("run");
|
||||
|
||||
constexpr int min_avail_simd = 128;
|
||||
dim3 grid(min_avail_simd);
|
||||
dim3 block(32);
|
||||
double time = 0.0;
|
||||
hipStream_t stream = {};
|
||||
hipEvent_t start = {};
|
||||
hipEvent_t stop = {};
|
||||
uint64_t nlaunch = 0;
|
||||
|
||||
HIP_API_CALL(hipSetDevice(devid));
|
||||
HIP_API_CALL(hipStreamCreate(&stream));
|
||||
HIP_API_CALL(hipEventCreate(&start));
|
||||
HIP_API_CALL(hipEventCreate(&stop));
|
||||
|
||||
for(size_t i = 0; i < niterations; ++i)
|
||||
{
|
||||
roctxMark("iteration");
|
||||
HIP_API_CALL(hipEventRecord(start, stream));
|
||||
reproducible_dispatch_count<<<grid, block, 0, stream>>>(nspin);
|
||||
HIP_API_CALL(hipEventRecord(stop, stream));
|
||||
check_hip_error();
|
||||
HIP_API_CALL(hipEventSynchronize(stop));
|
||||
float elapsed = 0.0f;
|
||||
HIP_API_CALL(hipEventElapsedTime(&elapsed, start, stop));
|
||||
time += static_cast<double>(elapsed);
|
||||
++nlaunch;
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
HIP_API_CALL(hipEventDestroy(start));
|
||||
HIP_API_CALL(hipEventDestroy(stop));
|
||||
|
||||
{
|
||||
auto _msg = std::stringstream{};
|
||||
_msg << '[' << getpid() << "][" << tid << "] Runtime of reproducible-dispatch-count is "
|
||||
<< std::setprecision(2) << std::fixed << time << " ms (" << std::setprecision(3)
|
||||
<< (time / 1000.0f) << " sec). Kernels dispatched: " << nlaunch << "\n";
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << _msg.str() << std::flush;
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
HIP_API_CALL(hipStreamDestroy(stream));
|
||||
|
||||
roctxRangeStop(roctx_range_id);
|
||||
}
|
||||
|
||||
void
|
||||
run_nsync(int tid, int devid)
|
||||
{
|
||||
auto roctx_range_id = roctxRangeStart("run");
|
||||
|
||||
constexpr int min_avail_simd = 128;
|
||||
dim3 grid(min_avail_simd);
|
||||
dim3 block(32);
|
||||
hipStream_t stream = {};
|
||||
uint64_t nlaunch = 0;
|
||||
|
||||
HIP_API_CALL(hipSetDevice(devid));
|
||||
HIP_API_CALL(hipStreamCreate(&stream));
|
||||
|
||||
auto _elapsed = std::chrono::steady_clock::duration{};
|
||||
auto _beg = std::chrono::steady_clock::now();
|
||||
for(size_t i = 0; i < niterations; ++i)
|
||||
{
|
||||
roctxMark("iteration");
|
||||
reproducible_dispatch_count<<<grid, block, 0, stream>>>(nspin);
|
||||
if((i % nsync) == (nsync - 1))
|
||||
{
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
auto _end = std::chrono::steady_clock::now();
|
||||
_elapsed += (_end - _beg);
|
||||
_beg = std::chrono::steady_clock::now();
|
||||
}
|
||||
++nlaunch;
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
auto _end = std::chrono::steady_clock::now();
|
||||
_elapsed += (_end - _beg);
|
||||
|
||||
{
|
||||
auto _time =
|
||||
std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(_elapsed).count();
|
||||
auto _msg = std::stringstream{};
|
||||
_msg << '[' << getpid() << "][" << tid << "] Runtime of reproducible-dispatch-count is "
|
||||
<< std::setprecision(2) << std::fixed << _time << " ms (" << std::setprecision(3)
|
||||
<< (_time / 1000.0f) << " sec). Kernels dispatched: " << nlaunch << "\n";
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << _msg.str() << std::flush;
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
HIP_API_CALL(hipStreamDestroy(stream));
|
||||
|
||||
roctxRangeStop(roctx_range_id);
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
void
|
||||
check_hip_error(void)
|
||||
{
|
||||
hipError_t err = hipGetLastError();
|
||||
if(err != hipSuccess)
|
||||
{
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
|
||||
throw std::runtime_error("hip_api_call");
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele