From 59d3399901b0933cbbd6e48901aa331a8785d418 Mon Sep 17 00:00:00 2001 From: Sajina PK Date: Fri, 21 Feb 2025 10:25:01 -0500 Subject: [PATCH] JPEG Activity tracing in Perfetto (#108) - Add JPEG activity track in perfetto trace - Add JPEG decode tests to the examples - Change existing videodecode test to include JPEG testing - Rename videodecode test file to decode to include jpeg tests too - Fix a bug in the test which checks for total activity of 0 - Disable rocDecode and rocJPEG samples from the github image files --- .github/workflows/opensuse.yml | 2 +- .github/workflows/redhat.yml | 2 +- .github/workflows/ubuntu-focal.yml | 8 +- .github/workflows/ubuntu-jammy.yml | 2 +- cmake/ConfigCPack.cmake | 1 + examples/CMakeLists.txt | 1 + examples/jpegdecode/CMakeLists.txt | 132 +++ examples/jpegdecode/jpegdecodeperf.cpp | 420 ++++++++ examples/jpegdecode/rocjpeg_samples_utils.h | 903 ++++++++++++++++++ examples/videodecode/CMakeLists.txt | 4 +- source/lib/core/categories.hpp | 2 + source/lib/core/components/fwd.hpp | 13 + source/lib/core/config.cpp | 18 +- .../rocprofiler-systems/categories.h | 3 +- source/lib/rocprof-sys/library/rocm_smi.cpp | 84 +- source/lib/rocprof-sys/library/rocm_smi.hpp | 30 +- source/lib/rocprof-sys/library/sampling.cpp | 9 +- tests/CMakeLists.txt | 2 +- tests/rocprof-sys-decode-tests.cmake | 49 + tests/rocprof-sys-videodecode-tests.cmake | 22 - tests/validate-perfetto-proto.py | 2 +- 21 files changed, 1637 insertions(+), 72 deletions(-) create mode 100644 examples/jpegdecode/CMakeLists.txt create mode 100644 examples/jpegdecode/jpegdecodeperf.cpp create mode 100644 examples/jpegdecode/rocjpeg_samples_utils.h create mode 100644 tests/rocprof-sys-decode-tests.cmake delete mode 100644 tests/rocprof-sys-videodecode-tests.cmake diff --git a/.github/workflows/opensuse.yml b/.github/workflows/opensuse.yml index 3062b89b86..3559224af5 100644 --- a/.github/workflows/opensuse.yml +++ b/.github/workflows/opensuse.yml @@ -102,7 +102,7 @@ jobs: -DROCPROFSYS_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10;py3.11" -DROCPROFSYS_CI_MPI_RUN_AS_ROOT=ON -DROCPROFSYS_MAX_THREADS=64 - -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;videodecode" + -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;videodecode;jpegdecode" -DROCPROFSYS_BUILD_NUMBER=${{ github.run_attempt }} - name: Install diff --git a/.github/workflows/redhat.yml b/.github/workflows/redhat.yml index 06aeaa9dfa..8fca7c63fb 100644 --- a/.github/workflows/redhat.yml +++ b/.github/workflows/redhat.yml @@ -121,7 +121,7 @@ jobs: -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl" -DROCPROFSYS_BUILD_NUMBER=${{ github.run_attempt }} -- - -LE "transpose|rccl|videodecode" + -LE "transpose|rccl|videodecode|jpegdecode" - name: Install timeout-minutes: 10 diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index bdc482a890..45423914ff 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -157,7 +157,7 @@ jobs: -DROCPROFSYS_PYTHON_PREFIX=/opt/conda/envs -DROCPROFSYS_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10;py3.11" -DROCPROFSYS_MAX_THREADS=64 - -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;videodecode" + -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;videodecode;jpegdecode" -DROCPROFSYS_BUILD_NUMBER=${{ github.run_attempt }} -DMPI_HEADERS_ALLOW_MPICH=OFF @@ -248,7 +248,7 @@ jobs: rocm-version: ['6.3'] mpi-headers: ['OFF'] build-jobs: ['3'] - ctest-exclude: ['-LE "transpose|videodecode"'] + ctest-exclude: ['-LE "transpose|videodecode|jpegdecode"'] env: BUILD_TYPE: MinSizeRel @@ -489,7 +489,7 @@ jobs: -DDYNINST_BUILD_STATIC_LIBS=OFF -DDYNINST_ELFUTILS_DOWNLOAD_VERSION=${{ env.ELFUTILS_DOWNLOAD_VERSION }} -DROCPROFSYS_MAX_THREADS=64 - -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;videodecode" + -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;videodecode;jpegdecode" -DROCPROFSYS_BUILD_NUMBER=${{ github.run_attempt }} -DMPI_HEADERS_ALLOW_MPICH=ON @@ -628,5 +628,5 @@ jobs: -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_RCCL=OFF -DROCPROFSYS_MAX_THREADS=64 - -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;videodecode" + -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;videodecode;jpegdecode" -DROCPROFSYS_BUILD_NUMBER=${{ github.run_attempt }} diff --git a/.github/workflows/ubuntu-jammy.yml b/.github/workflows/ubuntu-jammy.yml index 9a6b72ab2e..0cde63c860 100644 --- a/.github/workflows/ubuntu-jammy.yml +++ b/.github/workflows/ubuntu-jammy.yml @@ -190,7 +190,7 @@ jobs: -DROCPROFSYS_BUILD_NUMBER=${{ github.run_attempt }} -DUSE_CLANG_OMP=OFF -- - -LE "transpose|rccl|videodecode" + -LE "transpose|rccl|videodecode|jpegdecode" - name: Install timeout-minutes: 10 diff --git a/cmake/ConfigCPack.cmake b/cmake/ConfigCPack.cmake index 27211c5cec..99eead914e 100644 --- a/cmake/ConfigCPack.cmake +++ b/cmake/ConfigCPack.cmake @@ -173,6 +173,7 @@ if(ROCPROFSYS_USE_MPI) endif() if(ROCPROFSYS_BUILD_TESTING) list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocdecode-test") + list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocjpeg-test") endif() string(REPLACE ";" ", " _DEBIAN_PACKAGE_DEPENDS "${_DEBIAN_PACKAGE_DEPENDS}") set(CPACK_DEBIAN_PACKAGE_DEPENDS diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a4631091e3..388954f653 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -57,3 +57,4 @@ add_subdirectory(causal) add_subdirectory(trace-time-window) add_subdirectory(fork) add_subdirectory(videodecode) +add_subdirectory(jpegdecode) diff --git a/examples/jpegdecode/CMakeLists.txt b/examples/jpegdecode/CMakeLists.txt new file mode 100644 index 0000000000..fd4a8c46b2 --- /dev/null +++ b/examples/jpegdecode/CMakeLists.txt @@ -0,0 +1,132 @@ +################################################################################ +# Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR) + +# This example requires hip and rocjpeg. +find_package(HIP QUIET) + +if(NOT HIP_FOUND) + message(WARNING "hip is not found. Skip jpegdecode example.") + return() +endif() + +# Set AMD Clang as default compiler +if(NOT DEFINED CMAKE_CXX_COMPILER) + set(CMAKE_C_COMPILER ${ROCmVersion_DIR}/bin/amdclang) + set(CMAKE_CXX_COMPILER ${ROCmVersion_DIR}/bin/amdclang++) +endif() + +project(rocprofiler-systems-jpegdecode-example) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED On) + +list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../../cmake) +list(APPEND CMAKE_PREFIX_PATH ${ROCmVersion_DIR}/hip ${ROCmVersion_DIR}) +list(APPEND CMAKE_MODULE_PATH ${ROCmVersion_DIR}/share/rocjpeg/cmake) + +set(CMAKE_BUILD_TYPE "RelWithDebInfo") +string(REPLACE " " ";" _FLAGS "${CMAKE_CXX_FLAGS_DEBUG}") + +if(ROCPROFSYS_DISABLE_EXAMPLES) + get_filename_component(_DIR ${CMAKE_CURRENT_LIST_DIR} NAME) + + if(${PROJECT_NAME} IN_LIST ROCPROFSYS_DISABLE_EXAMPLES OR ${_DIR} IN_LIST + ROCPROFSYS_DISABLE_EXAMPLES) + return() + endif() +endif() + +# Copy image files to build directory +if(EXISTS "${ROCmVersion_DIR}/share/rocjpeg/images") + if(NOT EXISTS "${CMAKE_BINARY_DIR}/images") + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/images") + endif() + + file(GLOB_RECURSE image_files "${ROCmVersion_DIR}/share/rocjpeg/images/*") + file(COPY ${image_files} DESTINATION ${CMAKE_BINARY_DIR}/images) +else() + message( + AUTHOR_WARNING + "Source directory ${ROCmVersion_DIR}/share/rocjpeg/images does not exist") +endif() + +find_package(rocJPEG QUIET) +find_package(rocprofiler-register QUIET) + +# threads +find_package(Threads REQUIRED) + +if(HIP_FOUND + AND ROCJPEG_FOUND + AND Threads_FOUND + AND rocprofiler-register_FOUND) + # HIP + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} hip::host) + # threads + set(THREADS_PREFER_PTHREAD_FLAG ON) + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} Threads::Threads) + # std filesystem + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} stdc++fs) + # rocprofiler-register + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} rocprofiler-register::rocprofiler-register) + + # rocJPEG + include_directories(${ROCJPEG_INCLUDE_DIR}) + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} ${ROCJPEG_LIBRARY}) + list(APPEND SOURCES ${PROJECT_SOURCE_DIR} jpegdecodeperf.cpp) + add_executable(jpegdecode ${SOURCES}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") + target_link_libraries(jpegdecode ${LINK_LIBRARY_LIST}) + target_compile_options(jpegdecode PRIVATE ${_FLAGS}) + + if(ROCPROFSYS_INSTALL_EXAMPLES) + install( + TARGETS jpegdecode + DESTINATION bin + COMPONENT rocprofiler-systems-examples) + install( + FILES ${CMAKE_BINARY_DIR}/images + DESTINATION share/rocprofiler-systems/tests/images + COMPONENT rocprofiler-systems-examples) + endif() +else() + message( + "-- ERROR!: ${PROJECT_NAME} excluded! please install all the dependencies and try again!" + ) + if(NOT HIP_FOUND) + message(FATAL_ERROR "-- ERROR!: HIP Not Found! - please install ROCm and HIP!") + endif() + if(NOT ROCJPEG_FOUND) + message(WARNING "-- ERROR!: rocJPEG Not Found! - please install rocJPEG!") + endif() + if(NOT Threads_FOUND) + message(FATAL_ERROR "-- ERROR!: Threads Not Found! - please insatll Threads!") + endif() + if(NOT rocprofiler-register_FOUND) + message( + FATAL_ERROR + "-- ERROR!: rocprofiler-register Not Found! - please install rocprofiler-register!" + ) + endif() +endif() diff --git a/examples/jpegdecode/jpegdecodeperf.cpp b/examples/jpegdecode/jpegdecodeperf.cpp new file mode 100644 index 0000000000..32472c3f45 --- /dev/null +++ b/examples/jpegdecode/jpegdecodeperf.cpp @@ -0,0 +1,420 @@ +/* +Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "rocjpeg_samples_utils.h" + +struct DecodeInfo +{ + std::vector file_paths; + RocJpegHandle rocjpeg_handle; + std::vector rocjpeg_stream_handles; + uint64_t num_decoded_images; + double images_per_sec; + double image_size_in_mpixels_per_sec; + uint64_t num_bad_jpegs; + uint64_t num_jpegs_with_411_subsampling; + uint64_t num_jpegs_with_unknown_subsampling; + uint64_t num_jpegs_with_unsupported_resolution; +}; + +/** + * @brief Decodes a batch of JPEG images and optionally saves the decoded images. + * + * @param decode_info parameters info for decoding a batch of jpeg images. + * @param rocjpeg_utils Utility functions for RocJpeg operations. + * @param decode_params Parameters for decoding the JPEG images (output_format, + * crop_rectangle) + * @param save_images A boolean flag indicating whether to save the decoded images. + * @param output_file_path The file path where the decoded images will be saved. + * @param batch_size The number of images to be processed in each batch. + */ +void +DecodeImages(DecodeInfo& decode_info, RocJpegUtils rocjpeg_utils, + RocJpegDecodeParams& decode_params, bool save_images, + std::string& output_file_path, int batch_size, int device_id) +{ + bool is_roi_valid = false; + uint32_t roi_width; + uint32_t roi_height; + roi_width = decode_params.crop_rectangle.right - decode_params.crop_rectangle.left; + roi_height = decode_params.crop_rectangle.bottom - decode_params.crop_rectangle.top; + uint8_t num_components; + uint32_t channel_sizes[ROCJPEG_MAX_COMPONENT] = {}; + std::string chroma_sub_sampling = ""; + uint32_t num_channels = 0; + double image_size_in_mpixels_all = 0; + double total_decode_time_in_milli_sec = 0; + int current_batch_size = 0; + std::vector> batch_images(batch_size); + std::vector> widths( + batch_size, std::vector(ROCJPEG_MAX_COMPONENT, 0)); + std::vector> heights( + batch_size, std::vector(ROCJPEG_MAX_COMPONENT, 0)); + std::vector> prior_channel_sizes( + batch_size, std::vector(ROCJPEG_MAX_COMPONENT, 0)); + std::vector subsamplings(batch_size); + std::vector output_images(batch_size); + std::vector base_file_names(batch_size); + std::vector rocjpeg_stream_handles(batch_size); + std::vector temp_widths(ROCJPEG_MAX_COMPONENT, 0); + std::vector temp_heights(ROCJPEG_MAX_COMPONENT, 0); + RocJpegChromaSubsampling temp_subsampling; + std::string temp_base_file_name; + + CHECK_HIP(hipSetDevice(device_id)); + for(int i = 0; i < decode_info.file_paths.size(); i += batch_size) + { + int batch_end = + std::min(i + batch_size, static_cast(decode_info.file_paths.size())); + for(int j = i; j < batch_end; j++) + { + int index = j - i; + + temp_base_file_name = decode_info.file_paths[j].substr( + decode_info.file_paths[j].find_last_of("/\\") + 1); + // Read an image from disk. + std::ifstream input(decode_info.file_paths[j].c_str(), + std::ios::in | std::ios::binary | std::ios::ate); + if(!(input.is_open())) + { + std::cerr << "ERROR: Cannot open image: " << decode_info.file_paths[j] + << std::endl; + return; + } + // Get the size + std::streamsize file_size = input.tellg(); + input.seekg(0, std::ios::beg); + // resize if buffer is too small + if(batch_images[index].size() < file_size) + { + batch_images[index].resize(file_size); + } + if(!input.read(batch_images[index].data(), file_size)) + { + std::cerr << "ERROR: Cannot read from file: " << decode_info.file_paths[j] + << std::endl; + return; + } + + RocJpegStatus rocjpeg_status = + rocJpegStreamParse(reinterpret_cast(batch_images[index].data()), + file_size, decode_info.rocjpeg_stream_handles[index]); + if(rocjpeg_status != ROCJPEG_STATUS_SUCCESS) + { + decode_info.num_bad_jpegs++; + std::cerr << "Skipping decoding input file: " << decode_info.file_paths[j] + << std::endl; + continue; + } + + CHECK_ROCJPEG(rocJpegGetImageInfo(decode_info.rocjpeg_handle, + decode_info.rocjpeg_stream_handles[index], + &num_components, &temp_subsampling, + temp_widths.data(), temp_heights.data())); + if(roi_width > 0 && roi_height > 0 && roi_width <= temp_widths[0] && + roi_height <= temp_heights[0]) + { + is_roi_valid = true; + } + + rocjpeg_utils.GetChromaSubsamplingStr(temp_subsampling, chroma_sub_sampling); + if(temp_widths[0] < 64 || temp_heights[0] < 64) + { + decode_info.num_jpegs_with_unsupported_resolution++; + continue; + } + + if(temp_subsampling == ROCJPEG_CSS_411 || + temp_subsampling == ROCJPEG_CSS_UNKNOWN) + { + if(temp_subsampling == ROCJPEG_CSS_411) + { + decode_info.num_jpegs_with_411_subsampling++; + } + if(temp_subsampling == ROCJPEG_CSS_UNKNOWN) + { + decode_info.num_jpegs_with_unknown_subsampling++; + } + continue; + } + + if(rocjpeg_utils.GetChannelPitchAndSizes( + decode_params, temp_subsampling, temp_widths.data(), + temp_heights.data(), num_channels, output_images[current_batch_size], + channel_sizes)) + { + std::cerr << "ERROR: Failed to get the channel pitch and sizes" + << std::endl; + return; + } + + // allocate memory for each channel and reuse them if the sizes remain + // unchanged for a new image. + for(int n = 0; n < num_channels; n++) + { + if(prior_channel_sizes[current_batch_size][n] != channel_sizes[n]) + { + if(output_images[current_batch_size].channel[n] != nullptr) + { + CHECK_HIP(hipFree( + (void*) output_images[current_batch_size].channel[n])); + output_images[current_batch_size].channel[n] = nullptr; + } + CHECK_HIP(hipMalloc(&output_images[current_batch_size].channel[n], + channel_sizes[n])); + prior_channel_sizes[current_batch_size][n] = channel_sizes[n]; + } + } + + rocjpeg_stream_handles[current_batch_size] = + decode_info.rocjpeg_stream_handles[index]; + subsamplings[current_batch_size] = temp_subsampling; + widths[current_batch_size] = temp_widths; + heights[current_batch_size] = temp_heights; + base_file_names[current_batch_size] = temp_base_file_name; + current_batch_size++; + } + + double time_per_batch_in_milli_sec = 0; + if(current_batch_size > 0) + { + auto start_time = std::chrono::high_resolution_clock::now(); + CHECK_ROCJPEG(rocJpegDecodeBatched( + decode_info.rocjpeg_handle, rocjpeg_stream_handles.data(), + current_batch_size, &decode_params, output_images.data())); + auto end_time = std::chrono::high_resolution_clock::now(); + time_per_batch_in_milli_sec = + std::chrono::duration(end_time - start_time).count(); + } + + double image_size_in_mpixels = 0; + for(int b = 0; b < current_batch_size; b++) + { + image_size_in_mpixels += (static_cast(widths[b][0]) * + static_cast(heights[b][0]) / 1000000); + } + + decode_info.num_decoded_images += current_batch_size; + + if(save_images) + { + for(int b = 0; b < current_batch_size; b++) + { + std::string image_save_path = output_file_path; + // if ROI is present, need to pass roi_width and roi_height + uint32_t width = is_roi_valid ? roi_width : widths[b][0]; + uint32_t height = is_roi_valid ? roi_height : heights[b][0]; + rocjpeg_utils.GetOutputFileExt(decode_params.output_format, + base_file_names[b], width, height, + subsamplings[b], image_save_path); + rocjpeg_utils.SaveImage(image_save_path, &output_images[b], width, height, + subsamplings[b], decode_params.output_format); + } + } + + total_decode_time_in_milli_sec += time_per_batch_in_milli_sec; + image_size_in_mpixels_all += image_size_in_mpixels; + + current_batch_size = 0; + } + + double avg_time_per_image = + decode_info.num_decoded_images > 0 + ? total_decode_time_in_milli_sec / decode_info.num_decoded_images + : 0; + decode_info.images_per_sec = avg_time_per_image > 0 ? 1000 / avg_time_per_image : 0; + decode_info.image_size_in_mpixels_per_sec = decode_info.num_decoded_images > 0 + ? decode_info.images_per_sec * + image_size_in_mpixels_all / + decode_info.num_decoded_images + : 0; + + for(auto& it : output_images) + { + for(int i = 0; i < ROCJPEG_MAX_COMPONENT; i++) + { + if(it.channel[i] != nullptr) + { + CHECK_HIP(hipFree((void*) it.channel[i])); + it.channel[i] = nullptr; + } + } + } +} + +int +main(int argc, char** argv) +{ + int device_id = 0; + bool save_images = false; + int num_threads = 1; + int batch_size = 1; + bool is_dir = false; + bool is_file = false; + RocJpegBackend rocjpeg_backend = ROCJPEG_BACKEND_HARDWARE; + RocJpegDecodeParams decode_params = {}; + RocJpegUtils rocjpeg_utils; + std::string input_path, output_file_path; + std::vector file_paths = {}; + std::vector decode_info_per_thread; + + RocJpegUtils::ParseCommandLine(input_path, output_file_path, save_images, device_id, + rocjpeg_backend, decode_params, &num_threads, + &batch_size, argc, argv); + if(!RocJpegUtils::GetFilePaths(input_path, file_paths, is_dir, is_file)) + { + std::cerr << "ERROR: Failed to get input file paths!" << std::endl; + return EXIT_FAILURE; + } + if(!RocJpegUtils::InitHipDevice(device_id)) + { + std::cerr << "ERROR: Failed to initialize HIP!" << std::endl; + return EXIT_FAILURE; + } + + if(num_threads > file_paths.size()) + { + num_threads = file_paths.size(); + } + + decode_info_per_thread.resize(num_threads); + + for(int i = 0; i < num_threads; i++) + { + CHECK_ROCJPEG(rocJpegCreate(rocjpeg_backend, device_id, + &decode_info_per_thread[i].rocjpeg_handle)); + decode_info_per_thread[i].rocjpeg_stream_handles.resize(batch_size); + for(auto j = 0; j < batch_size; j++) + { + CHECK_ROCJPEG(rocJpegStreamCreate( + &decode_info_per_thread[i].rocjpeg_stream_handles[j])); + } + decode_info_per_thread[i].num_decoded_images = 0; + decode_info_per_thread[i].images_per_sec = 0; + decode_info_per_thread[i].image_size_in_mpixels_per_sec = 0; + decode_info_per_thread[i].num_bad_jpegs = 0; + decode_info_per_thread[i].num_jpegs_with_411_subsampling = 0; + decode_info_per_thread[i].num_jpegs_with_unknown_subsampling = 0; + decode_info_per_thread[i].num_jpegs_with_unsupported_resolution = 0; + } + + ThreadPool thread_pool(num_threads); + + size_t files_per_thread = file_paths.size() / num_threads; + size_t remaining_files = file_paths.size() % num_threads; + size_t start_index = 0; + for(int i = 0; i < num_threads; i++) + { + size_t end_index = start_index + files_per_thread + (i < remaining_files ? 1 : 0); + decode_info_per_thread[i].file_paths.assign(file_paths.begin() + start_index, + file_paths.begin() + end_index); + start_index = end_index; + } + + std::cout << "Decoding started with " << num_threads << " threads, please wait!" + << std::endl; + for(int i = 0; i < num_threads; ++i) + { + thread_pool.ExecuteJob( + std::bind(DecodeImages, std::ref(decode_info_per_thread[i]), rocjpeg_utils, + std::ref(decode_params), save_images, std::ref(output_file_path), + batch_size, device_id)); + } + thread_pool.JoinThreads(); + + uint64_t total_decoded_images = 0; + double total_images_per_sec = 0; + double total_image_size_in_mpixels_per_sec = 0; + uint64_t total_num_bad_jpegs = 0; + uint64_t total_num_jpegs_with_411_subsampling = 0; + uint64_t total_num_jpegs_with_unknown_subsampling = 0; + uint64_t total_num_jpegs_with_unsupported_resolution = 0; + + for(auto i = 0; i < num_threads; i++) + { + total_decoded_images += decode_info_per_thread[i].num_decoded_images; + total_image_size_in_mpixels_per_sec += + decode_info_per_thread[i].image_size_in_mpixels_per_sec; + total_images_per_sec += decode_info_per_thread[i].images_per_sec; + total_num_bad_jpegs += decode_info_per_thread[i].num_bad_jpegs; + total_num_jpegs_with_411_subsampling += + decode_info_per_thread[i].num_jpegs_with_411_subsampling; + total_num_jpegs_with_unknown_subsampling += + decode_info_per_thread[i].num_jpegs_with_unknown_subsampling; + total_num_jpegs_with_unsupported_resolution += + decode_info_per_thread[i].num_jpegs_with_unsupported_resolution; + } + + std::cout << "Total decoded images: " << total_decoded_images << std::endl; + if(total_num_bad_jpegs || total_num_jpegs_with_411_subsampling || + total_num_jpegs_with_unknown_subsampling || + total_num_jpegs_with_unsupported_resolution) + { + std::cout << "Total skipped images: " + << total_num_bad_jpegs + total_num_jpegs_with_411_subsampling + + total_num_jpegs_with_unknown_subsampling + + total_num_jpegs_with_unsupported_resolution; + if(total_num_bad_jpegs) + { + std::cout << " ,total images that cannot be parsed: " << total_num_bad_jpegs; + } + if(total_num_jpegs_with_411_subsampling) + { + std::cout << " ,total images with YUV 4:1:1 chroam subsampling: " + << total_num_jpegs_with_411_subsampling; + } + if(total_num_jpegs_with_unknown_subsampling) + { + std::cout << " ,total images with unknwon chroam subsampling: " + << total_num_jpegs_with_unknown_subsampling; + } + if(total_num_jpegs_with_unsupported_resolution) + { + std::cout << " ,total images with unsupported_resolution: " + << total_num_jpegs_with_unsupported_resolution; + } + std::cout << std::endl; + } + + if(total_decoded_images > 0) + { + std::cout << "Average processing time per image (ms): " + << 1000 / total_images_per_sec << std::endl; + std::cout << "Average decoded images per sec (Images/Sec): " + << total_images_per_sec << std::endl; + std::cout << "Average decoded images size (Mpixels/Sec): " + << total_image_size_in_mpixels_per_sec << std::endl; + } + + for(int i = 0; i < num_threads; i++) + { + CHECK_ROCJPEG(rocJpegDestroy(decode_info_per_thread[i].rocjpeg_handle)); + for(auto j = 0; j < batch_size; j++) + { + CHECK_ROCJPEG(rocJpegStreamDestroy( + decode_info_per_thread[i].rocjpeg_stream_handles[j])); + } + } + + std::cout << "Decoding completed!" << std::endl; + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/examples/jpegdecode/rocjpeg_samples_utils.h b/examples/jpegdecode/rocjpeg_samples_utils.h new file mode 100644 index 0000000000..1bad7af2dc --- /dev/null +++ b/examples/jpegdecode/rocjpeg_samples_utils.h @@ -0,0 +1,903 @@ +/* +Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef ROC_JPEG_SAMPLES_COMMON +#define ROC_JPEG_SAMPLES_COMMON +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if __cplusplus >= 201703L && __has_include() +# include +namespace fs = std::filesystem; +#else +# include +namespace fs = std::experimental::filesystem; +#endif +#include "rocjpeg.h" +#include + +#define CHECK_ROCJPEG(call) \ + { \ + RocJpegStatus rocjpeg_status = (call); \ + if(rocjpeg_status != ROCJPEG_STATUS_SUCCESS) \ + { \ + std::cerr << #call << " returned " << rocJpegGetErrorName(rocjpeg_status) \ + << " at " << __FILE__ << ":" << __LINE__ << std::endl; \ + exit(1); \ + } \ + } + +#define CHECK_HIP(call) \ + { \ + hipError_t hip_status = (call); \ + if(hip_status != hipSuccess) \ + { \ + std::cout << "rocJPEG failure: '#" << hip_status << "' at " << __FILE__ \ + << ":" << __LINE__ << std::endl; \ + exit(1); \ + } \ + } + +/** + * @class RocJpegUtils + * @brief Utility class for rocJPEG samples. + * + * This class provides utility functions for rocJPEG samples, such as parsing command line + * arguments, getting file paths, initializing HIP device, getting chroma subsampling + * string, getting channel pitch and sizes, getting output file extension, and saving + * images. + */ +class RocJpegUtils +{ +public: + /** + * @brief Parses the command line arguments. + * + * This function parses the command line arguments and sets the corresponding + * variables. + * + * @param input_path The input path. + * @param output_file_path The output file path. + * @param save_images Flag indicating whether to save images. + * @param device_id The device ID. + * @param rocjpeg_backend The rocJPEG backend. + * @param decode_params The rocJPEG decode parameters. + * @param num_threads The number of threads. + * @param crop The crop rectangle. + * @param argc The number of command line arguments. + * @param argv The command line arguments. + */ + static void ParseCommandLine(std::string& input_path, std::string& output_file_path, + bool& save_images, int& device_id, + RocJpegBackend& rocjpeg_backend, + RocJpegDecodeParams& decode_params, int* num_threads, + int* batch_size, int argc, char* argv[]) + { + if(argc <= 1) + { + ShowHelpAndExit("", num_threads != nullptr, batch_size != nullptr); + } + for(int i = 1; i < argc; i++) + { + if(!strcmp(argv[i], "-h")) + { + ShowHelpAndExit("", num_threads != nullptr, batch_size != nullptr); + } + if(!strcmp(argv[i], "-i")) + { + if(++i == argc) + { + ShowHelpAndExit("-i", num_threads != nullptr, batch_size != nullptr); + } + input_path = argv[i]; + continue; + } + if(!strcmp(argv[i], "-o")) + { + if(++i == argc) + { + ShowHelpAndExit("-o", num_threads != nullptr, batch_size != nullptr); + } + output_file_path = argv[i]; + save_images = true; + continue; + } + if(!strcmp(argv[i], "-d")) + { + if(++i == argc) + { + ShowHelpAndExit("-d", num_threads != nullptr, batch_size != nullptr); + } + device_id = atoi(argv[i]); + continue; + } + if(!strcmp(argv[i], "-be")) + { + if(++i == argc) + { + ShowHelpAndExit("-be", num_threads != nullptr, batch_size != nullptr); + } + rocjpeg_backend = static_cast(atoi(argv[i])); + continue; + } + if(!strcmp(argv[i], "-fmt")) + { + if(++i == argc) + { + ShowHelpAndExit("-fmt", num_threads != nullptr, + batch_size != nullptr); + } + std::string selected_output_format = argv[i]; + if(selected_output_format == "native") + { + decode_params.output_format = ROCJPEG_OUTPUT_NATIVE; + } + else if(selected_output_format == "yuv_planar") + { + decode_params.output_format = ROCJPEG_OUTPUT_YUV_PLANAR; + } + else if(selected_output_format == "y") + { + decode_params.output_format = ROCJPEG_OUTPUT_Y; + } + else if(selected_output_format == "rgb") + { + decode_params.output_format = ROCJPEG_OUTPUT_RGB; + } + else if(selected_output_format == "rgb_planar") + { + decode_params.output_format = ROCJPEG_OUTPUT_RGB_PLANAR; + } + else + { + ShowHelpAndExit(argv[i], num_threads != nullptr); + } + continue; + } + if(!strcmp(argv[i], "-t")) + { + if(++i == argc) + { + ShowHelpAndExit("-t", num_threads != nullptr, batch_size != nullptr); + } + if(num_threads != nullptr) + { + *num_threads = atoi(argv[i]); + if(*num_threads <= 0 || *num_threads > 32) + { + ShowHelpAndExit(argv[i], num_threads != nullptr, + batch_size != nullptr); + } + } + continue; + } + if(!strcmp(argv[i], "-b")) + { + if(++i == argc) + { + ShowHelpAndExit("-b", num_threads != nullptr, batch_size != nullptr); + } + if(batch_size != nullptr) *batch_size = atoi(argv[i]); + continue; + } + if(!strcmp(argv[i], "-crop")) + { + if(++i == argc || 4 != sscanf(argv[i], "%hd,%hd,%hd,%hd", + &decode_params.crop_rectangle.left, + &decode_params.crop_rectangle.top, + &decode_params.crop_rectangle.right, + &decode_params.crop_rectangle.bottom)) + { + ShowHelpAndExit("-crop"); + } + if((&decode_params.crop_rectangle.right - + &decode_params.crop_rectangle.left) % + 2 == + 1 || + (&decode_params.crop_rectangle.bottom - + &decode_params.crop_rectangle.top) % + 2 == + 1) + { + std::cout << "output crop rectangle must have width and height of " + "even numbers" + << std::endl; + exit(1); + } + continue; + } + ShowHelpAndExit(argv[i], num_threads != nullptr, batch_size != nullptr); + } + } + + /** + * Checks if a file is a JPEG file. + * + * @param filePath The path to the file to be checked. + * @return True if the file is a JPEG file, false otherwise. + */ + static bool IsJPEG(const std::string& filePath) + { + std::ifstream file(filePath, std::ios::binary); + if(!file.is_open()) + { + std::cerr << "Failed to open file: " << filePath << std::endl; + return false; + } + + unsigned char buffer[2]; + file.read(reinterpret_cast(buffer), 2); + file.close(); + + // The first two bytes of every JPEG stream are always 0xFFD8, which represents + // the Start of Image (SOI) marker. + return buffer[0] == 0xFF && buffer[1] == 0xD8; + } + + /** + * @brief Gets the file paths. + * + * This function gets the file paths based on the input path and sets the + * corresponding variables. + * + * @param input_path The input path. + * @param file_paths The vector to store the file paths. + * @param is_dir Flag indicating whether the input path is a directory. + * @param is_file Flag indicating whether the input path is a file. + * @return True if successful, false otherwise. + */ + static bool GetFilePaths(std::string& input_path, + std::vector& file_paths, bool& is_dir, + bool& is_file) + { + std::cout << "Reading images from disk, please wait!" << std::endl; + if(!fs::exists(input_path)) + { + std::cerr << "ERROR: the input path does not exist!" << std::endl; + return false; + } + is_dir = fs::is_directory(input_path); + is_file = fs::is_regular_file(input_path); + if(is_dir) + { + for(const auto& entry : fs::recursive_directory_iterator(input_path)) + { + if(fs::is_regular_file(entry) && IsJPEG(entry.path().string())) + { + file_paths.push_back(entry.path().string()); + } + } + } + else if(is_file && IsJPEG(input_path)) + { + file_paths.push_back(input_path); + } + else + { + std::cerr << "ERROR: the input path does not contain JPEG files!" + << std::endl; + return false; + } + return true; + } + + /** + * @brief Initializes the HIP device. + * + * This function initializes the HIP device with the specified device ID. + * + * @param device_id The device ID. + * @return True if successful, false otherwise. + */ + static bool InitHipDevice(int device_id) + { + int num_devices; + hipDeviceProp_t hip_dev_prop; + CHECK_HIP(hipGetDeviceCount(&num_devices)); + if(num_devices < 1) + { + std::cerr << "ERROR: didn't find any GPU!" << std::endl; + return false; + } + if(device_id >= num_devices) + { + std::cerr << "ERROR: the requested device_id is not found!" << std::endl; + return false; + } + CHECK_HIP(hipSetDevice(device_id)); + CHECK_HIP(hipGetDeviceProperties(&hip_dev_prop, device_id)); + + std::cout << "Using GPU device " << device_id << ": " << hip_dev_prop.name << "[" + << hip_dev_prop.gcnArchName << "] on PCI bus " << std::setfill('0') + << std::setw(2) << std::right << std::hex << hip_dev_prop.pciBusID + << ":" << std::setfill('0') << std::setw(2) << std::right << std::hex + << hip_dev_prop.pciDomainID << "." << hip_dev_prop.pciDeviceID + << std::dec << std::endl; + + return true; + } + + /** + * @brief Gets the chroma subsampling string. + * + * This function gets the chroma subsampling string based on the specified subsampling + * value. + * + * @param subsampling The chroma subsampling value. + * @param chroma_sub_sampling The string to store the chroma subsampling. + */ + void GetChromaSubsamplingStr(RocJpegChromaSubsampling subsampling, + std::string& chroma_sub_sampling) + { + switch(subsampling) + { + case ROCJPEG_CSS_444: chroma_sub_sampling = "YUV 4:4:4"; break; + case ROCJPEG_CSS_440: chroma_sub_sampling = "YUV 4:4:0"; break; + case ROCJPEG_CSS_422: chroma_sub_sampling = "YUV 4:2:2"; break; + case ROCJPEG_CSS_420: chroma_sub_sampling = "YUV 4:2:0"; break; + case ROCJPEG_CSS_411: chroma_sub_sampling = "YUV 4:1:1"; break; + case ROCJPEG_CSS_400: chroma_sub_sampling = "YUV 4:0:0"; break; + case ROCJPEG_CSS_UNKNOWN: chroma_sub_sampling = "UNKNOWN"; break; + default: chroma_sub_sampling = ""; break; + } + } + + /** + * @brief Gets the channel pitch and sizes. + * + * This function gets the channel pitch and sizes based on the specified output + * format, chroma subsampling, output image, and channel sizes. + * + * @param decode_params The decode parameters that specify the output format and crop + * rectangle. + * @param subsampling The chroma subsampling. + * @param widths The array to store the channel widths. + * @param heights The array to store the channel heights. + * @param num_channels The number of channels. + * @param output_image The output image. + * @param channel_sizes The array to store the channel sizes. + * @return The channel pitch. + */ + int GetChannelPitchAndSizes(RocJpegDecodeParams decode_params, + RocJpegChromaSubsampling subsampling, uint32_t* widths, + uint32_t* heights, uint32_t& num_channels, + RocJpegImage& output_image, uint32_t* channel_sizes) + { + bool is_roi_valid = false; + uint32_t roi_width; + uint32_t roi_height; + roi_width = + decode_params.crop_rectangle.right - decode_params.crop_rectangle.left; + roi_height = + decode_params.crop_rectangle.bottom - decode_params.crop_rectangle.top; + if(roi_width > 0 && roi_height > 0 && roi_width <= widths[0] && + roi_height <= heights[0]) + { + is_roi_valid = true; + } + switch(decode_params.output_format) + { + case ROCJPEG_OUTPUT_NATIVE: + switch(subsampling) + { + case ROCJPEG_CSS_444: + num_channels = 3; + output_image.pitch[2] = output_image.pitch[1] = + output_image.pitch[0] = is_roi_valid ? roi_width : widths[0]; + channel_sizes[2] = channel_sizes[1] = channel_sizes[0] = + align(output_image.pitch[0] * + (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + break; + case ROCJPEG_CSS_440: + num_channels = 3; + output_image.pitch[2] = output_image.pitch[1] = + output_image.pitch[0] = is_roi_valid ? roi_width : widths[0]; + channel_sizes[0] = + align(output_image.pitch[0] * + (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + channel_sizes[2] = channel_sizes[1] = + align(output_image.pitch[0] * + ((is_roi_valid ? roi_height : heights[0]) >> 1), + mem_alignment); + break; + case ROCJPEG_CSS_422: + num_channels = 1; + output_image.pitch[0] = + (is_roi_valid ? roi_width : widths[0]) * 2; + channel_sizes[0] = + align(output_image.pitch[0] * + (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + break; + case ROCJPEG_CSS_420: + num_channels = 2; + output_image.pitch[1] = output_image.pitch[0] = + is_roi_valid ? roi_width : widths[0]; + channel_sizes[0] = + align(output_image.pitch[0] * + (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + channel_sizes[1] = + align(output_image.pitch[1] * + ((is_roi_valid ? roi_height : heights[0]) >> 1), + mem_alignment); + break; + case ROCJPEG_CSS_400: + num_channels = 1; + output_image.pitch[0] = is_roi_valid ? roi_width : widths[0]; + channel_sizes[0] = + align(output_image.pitch[0] * + (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + break; + default: + std::cout << "Unknown chroma subsampling!" << std::endl; + return EXIT_FAILURE; + } + break; + case ROCJPEG_OUTPUT_YUV_PLANAR: + if(subsampling == ROCJPEG_CSS_400) + { + num_channels = 1; + output_image.pitch[0] = is_roi_valid ? roi_width : widths[0]; + channel_sizes[0] = align(output_image.pitch[0] * + (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + } + else + { + num_channels = 3; + output_image.pitch[0] = is_roi_valid ? roi_width : widths[0]; + output_image.pitch[1] = is_roi_valid ? roi_width : widths[1]; + output_image.pitch[2] = is_roi_valid ? roi_width : widths[2]; + channel_sizes[0] = align(output_image.pitch[0] * + (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + channel_sizes[1] = align(output_image.pitch[1] * + (is_roi_valid ? roi_height : heights[1]), + mem_alignment); + channel_sizes[2] = align(output_image.pitch[2] * + (is_roi_valid ? roi_height : heights[2]), + mem_alignment); + } + break; + case ROCJPEG_OUTPUT_Y: + num_channels = 1; + output_image.pitch[0] = is_roi_valid ? roi_width : widths[0]; + channel_sizes[0] = align(output_image.pitch[0] * + (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + break; + case ROCJPEG_OUTPUT_RGB: + num_channels = 1; + output_image.pitch[0] = (is_roi_valid ? roi_width : widths[0]) * 3; + channel_sizes[0] = align(output_image.pitch[0] * + (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + break; + case ROCJPEG_OUTPUT_RGB_PLANAR: + num_channels = 3; + output_image.pitch[2] = output_image.pitch[1] = output_image.pitch[0] = + is_roi_valid ? roi_width : widths[0]; + channel_sizes[2] = channel_sizes[1] = channel_sizes[0] = align( + output_image.pitch[0] * (is_roi_valid ? roi_height : heights[0]), + mem_alignment); + break; + default: + std::cout << "Unknown output format!" << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; + } + + /** + * @brief Gets the output file extension. + * + * This function gets the output file extension based on the specified output format, + * base file name, image width, image height, and file name for saving. + * + * @param output_format The output format. + * @param base_file_name The base file name. + * @param image_width The image width. + * @param image_height The image height. + * @param file_name_for_saving The string to store the file name for saving. + */ + void GetOutputFileExt(RocJpegOutputFormat output_format, std::string& base_file_name, + uint32_t image_width, uint32_t image_height, + RocJpegChromaSubsampling subsampling, + std::string& file_name_for_saving) + { + std::string file_extension; + std::string::size_type const p(base_file_name.find_last_of('.')); + std::string file_name_no_ext = base_file_name.substr(0, p); + std::string format_description = ""; + switch(output_format) + { + case ROCJPEG_OUTPUT_NATIVE: + file_extension = "yuv"; + switch(subsampling) + { + case ROCJPEG_CSS_444: format_description = "444"; break; + case ROCJPEG_CSS_440: format_description = "440"; break; + case ROCJPEG_CSS_422: format_description = "422_yuyv"; break; + case ROCJPEG_CSS_420: format_description = "nv12"; break; + case ROCJPEG_CSS_400: format_description = "400"; break; + default: + std::cout << "Unknown chroma subsampling!" << std::endl; + return; + } + break; + case ROCJPEG_OUTPUT_YUV_PLANAR: + file_extension = "yuv"; + format_description = "planar"; + break; + case ROCJPEG_OUTPUT_Y: + file_extension = "yuv"; + format_description = "400"; + break; + case ROCJPEG_OUTPUT_RGB: + file_extension = "rgb"; + format_description = "packed"; + break; + case ROCJPEG_OUTPUT_RGB_PLANAR: + file_extension = "rgb"; + format_description = "planar"; + break; + default: file_extension = ""; break; + } + file_name_for_saving += "//" + file_name_no_ext + "_" + + std::to_string(image_width) + "x" + + std::to_string(image_height) + "_" + format_description + + "." + file_extension; + } + + /** + * @brief Saves the image. + * + * This function saves the image to the specified output file name based on the output + * image, image width, image height, chroma subsampling, and output format. + * + * @param output_file_name The output file name. + * @param output_image The output image. + * @param img_width The image width. + * @param img_height The image height. + * @param subsampling The chroma subsampling. + * @param output_format The output format. + */ + void SaveImage(std::string output_file_name, RocJpegImage* output_image, + uint32_t img_width, uint32_t img_height, + RocJpegChromaSubsampling subsampling, + RocJpegOutputFormat output_format) + { + uint8_t* hst_ptr = nullptr; + FILE* fp; + hipError_t hip_status = hipSuccess; + + if(output_image == nullptr || output_image->channel[0] == nullptr || + output_image->pitch[0] == 0) + { + return; + } + + uint32_t widths[ROCJPEG_MAX_COMPONENT] = {}; + uint32_t heights[ROCJPEG_MAX_COMPONENT] = {}; + + switch(output_format) + { + case ROCJPEG_OUTPUT_NATIVE: + switch(subsampling) + { + case ROCJPEG_CSS_444: + widths[2] = widths[1] = widths[0] = img_width; + heights[2] = heights[1] = heights[0] = img_height; + break; + case ROCJPEG_CSS_440: + widths[2] = widths[1] = widths[0] = img_width; + heights[0] = img_height; + heights[2] = heights[1] = img_height >> 1; + break; + case ROCJPEG_CSS_422: + widths[0] = img_width * 2; + heights[0] = img_height; + break; + case ROCJPEG_CSS_420: + widths[1] = widths[0] = img_width; + heights[0] = img_height; + heights[1] = img_height >> 1; + break; + case ROCJPEG_CSS_400: + widths[0] = img_width; + heights[0] = img_height; + break; + default: + std::cout << "Unknown chroma subsampling!" << std::endl; + return; + } + break; + case ROCJPEG_OUTPUT_YUV_PLANAR: + switch(subsampling) + { + case ROCJPEG_CSS_444: + widths[2] = widths[1] = widths[0] = img_width; + heights[2] = heights[1] = heights[0] = img_height; + break; + case ROCJPEG_CSS_440: + widths[2] = widths[1] = widths[0] = img_width; + heights[0] = img_height; + heights[2] = heights[1] = img_height >> 1; + break; + case ROCJPEG_CSS_422: + widths[0] = img_width; + widths[2] = widths[1] = widths[0] >> 1; + heights[2] = heights[1] = heights[0] = img_height; + break; + case ROCJPEG_CSS_420: + widths[0] = img_width; + widths[2] = widths[1] = widths[0] >> 1; + heights[0] = img_height; + heights[2] = heights[1] = img_height >> 1; + break; + case ROCJPEG_CSS_400: + widths[0] = img_width; + heights[0] = img_height; + break; + default: + std::cout << "Unknown chroma subsampling!" << std::endl; + return; + } + break; + case ROCJPEG_OUTPUT_Y: + widths[0] = img_width; + heights[0] = img_height; + break; + case ROCJPEG_OUTPUT_RGB: + widths[0] = img_width * 3; + heights[0] = img_height; + break; + case ROCJPEG_OUTPUT_RGB_PLANAR: + widths[2] = widths[1] = widths[0] = img_width; + heights[2] = heights[1] = heights[0] = img_height; + break; + default: std::cout << "Unknown output format!" << std::endl; return; + } + + uint32_t channel0_size = output_image->pitch[0] * heights[0]; + uint32_t channel1_size = output_image->pitch[1] * heights[1]; + uint32_t channel2_size = output_image->pitch[2] * heights[2]; + + uint32_t output_image_size = channel0_size + channel1_size + channel2_size; + + if(hst_ptr == nullptr) + { + hst_ptr = new uint8_t[output_image_size]; + } + + CHECK_HIP( + hipMemcpyDtoH((void*) hst_ptr, output_image->channel[0], channel0_size)); + + uint8_t* tmp_hst_ptr = hst_ptr; + fp = fopen(output_file_name.c_str(), "wb"); + if(fp) + { + // write channel0 + if(widths[0] == output_image->pitch[0]) + { + fwrite(hst_ptr, 1, channel0_size, fp); + } + else + { + for(int i = 0; i < heights[0]; i++) + { + fwrite(tmp_hst_ptr, 1, widths[0], fp); + tmp_hst_ptr += output_image->pitch[0]; + } + } + // write channel1 + if(channel1_size != 0 && output_image->channel[1] != nullptr) + { + uint8_t* channel1_hst_ptr = hst_ptr + channel0_size; + CHECK_HIP(hipMemcpyDtoH((void*) channel1_hst_ptr, + output_image->channel[1], channel1_size)); + if(widths[1] == output_image->pitch[1]) + { + fwrite(channel1_hst_ptr, 1, channel1_size, fp); + } + else + { + for(int i = 0; i < heights[1]; i++) + { + fwrite(channel1_hst_ptr, 1, widths[1], fp); + channel1_hst_ptr += output_image->pitch[1]; + } + } + } + // write channel2 + if(channel2_size != 0 && output_image->channel[2] != nullptr) + { + uint8_t* channel2_hst_ptr = hst_ptr + channel0_size + channel1_size; + CHECK_HIP(hipMemcpyDtoH((void*) channel2_hst_ptr, + output_image->channel[2], channel2_size)); + if(widths[2] == output_image->pitch[2]) + { + fwrite(channel2_hst_ptr, 1, channel2_size, fp); + } + else + { + for(int i = 0; i < heights[2]; i++) + { + fwrite(channel2_hst_ptr, 1, widths[2], fp); + channel2_hst_ptr += output_image->pitch[2]; + } + } + } + fclose(fp); + } + + if(hst_ptr != nullptr) + { + delete[] hst_ptr; + hst_ptr = nullptr; + tmp_hst_ptr = nullptr; + } + } + +private: + static const int mem_alignment = 4 * 1024 * 1024; + /** + * @brief Shows the help message and exits. + * + * This function shows the help message and exits the program. + * + * @param option The option to display in the help message (optional). + * @param show_threads Flag indicating whether to show the number of threads in the + * help message. + */ + static void ShowHelpAndExit(const char* option = nullptr, bool show_threads = false, + bool show_batch_size = false) + { + std::cout << "Options:\n" + "-i [input path] - input path to a single JPEG image or a " + "directory containing JPEG images - [required]\n" + "-be [backend] - select rocJPEG backend (0 for " + "hardware-accelerated JPEG decoding using VCN,\n" + " 1 for hybrid JPEG " + "decoding using CPU and GPU HIP kernels (currently not supported)) " + "[optional - default: 0]\n" + "-fmt [output format] - select rocJPEG output format for " + "decoding, one of the [native, yuv_planar, y, rgb, rgb_planar] - " + "[optional - default: native]\n" + "-o [output path] - path to an output file or a path to an " + "existing directory - write decoded images to a file or an existing " + "directory based on selected output format - [optional]\n" + "-crop [crop rectangle] - crop rectangle for output in a " + "comma-separated format: left,top,right,bottom - [optional]\n" + "-d [device id] - specify the GPU device id for the desired " + "device (use 0 for the first device, 1 for the second device, and " + "so on) [optional - default: 0]\n"; + if(show_threads) + { + std::cout << "-t [threads] - number of threads (<= 32) for parallel JPEG " + "decoding - [optional - default: 1]\n"; + } + if(show_batch_size) + { + std::cout << "-b [batch_size] - decode images from input by batches of a " + "specified size - [optional - default: 1]\n"; + } + exit(0); + } + /** + * @brief Aligns a value to a specified alignment. + * + * This function takes a value and aligns it to the specified alignment. It returns + * the aligned value. + * + * @param value The value to be aligned. + * @param alignment The alignment value. + * @return The aligned value. + */ + static inline int align(int value, int alignment) + { + return (value + alignment - 1) & ~(alignment - 1); + } +}; + +class ThreadPool +{ +public: + ThreadPool(int nthreads) + : shutdown_(false) + { + // Create the specified number of threads + threads_.reserve(nthreads); + for(int i = 0; i < nthreads; ++i) + threads_.emplace_back(std::bind(&ThreadPool::ThreadEntry, this, i)); + } + + ~ThreadPool() {} + + void JoinThreads() + { + { + // Unblock any threads and tell them to stop + std::unique_lock lock(mutex_); + shutdown_ = true; + cond_var_.notify_all(); + } + + // Wait for all threads to stop + for(auto& thread : threads_) + thread.join(); + } + + void ExecuteJob(std::function func) + { + // Place a job on the queue and unblock a thread + std::unique_lock lock(mutex_); + decode_jobs_queue_.emplace(std::move(func)); + cond_var_.notify_one(); + } + +protected: + void ThreadEntry(int i) + { + std::function execute_decode_job; + + while(true) + { + { + std::unique_lock lock(mutex_); + cond_var_.wait(lock, + [&] { return shutdown_ || !decode_jobs_queue_.empty(); }); + if(decode_jobs_queue_.empty()) + { + // No jobs to do; shutting down + return; + } + + execute_decode_job = std::move(decode_jobs_queue_.front()); + decode_jobs_queue_.pop(); + } + + // Execute the decode job without holding any locks + execute_decode_job(); + } + } + + std::mutex mutex_; + std::condition_variable cond_var_; + bool shutdown_; + std::queue> decode_jobs_queue_; + std::vector threads_; +}; +#endif // ROC_JPEG_SAMPLES_COMMON \ No newline at end of file diff --git a/examples/videodecode/CMakeLists.txt b/examples/videodecode/CMakeLists.txt index c2d9fb8345..fee5048610 100644 --- a/examples/videodecode/CMakeLists.txt +++ b/examples/videodecode/CMakeLists.txt @@ -170,9 +170,9 @@ else() "-- ERROR!: videodecode excluded! please install all the dependencies and try again!" ) if(NOT FFMPEG_FOUND) - message(FATAL_ERROR "-- ERROR!: FFMPEG Not Found! - please install FFMPEG!") + message(WARNING "-- ERROR!: FFMPEG Not Found! - please install FFMPEG!") endif() if(NOT ROCDECODE_FOUND) - message(FATAL_ERROR "-- ERROR!: rocDecode Not Found! - please install rocDecode!") + message(WARNING "-- ERROR!: rocDecode Not Found! - please install rocDecode!") endif() endif() diff --git a/source/lib/core/categories.hpp b/source/lib/core/categories.hpp index bb54cdf661..2707378457 100644 --- a/source/lib/core/categories.hpp +++ b/source/lib/core/categories.hpp @@ -106,6 +106,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_temp, ROCPROFSYS_CATEGORY_ROCM_SMI ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_power, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, "device_power", "Power consumption of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_memory_usage, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_vcn_activity, ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY, "device_vcn_activity", "VCN Activity of a GPU device") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_jpeg_activity, ROCPROFSYS_CATEGORY_ROCM_SMI_JPEG_ACTIVITY, "device_jpeg_activity", "JPEG Activity of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocdecode_api, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, "rocm_rocdecode_api", "ROCm ROCDecode API") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions") ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions") @@ -171,6 +172,7 @@ using name = perfetto_category; ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_power), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_memory_usage), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_vcn_activity), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_jpeg_activity), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rocdecode_api), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \ ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \ diff --git a/source/lib/core/components/fwd.hpp b/source/lib/core/components/fwd.hpp index 8259378ff6..9ad0847cb1 100644 --- a/source/lib/core/components/fwd.hpp +++ b/source/lib/core/components/fwd.hpp @@ -82,6 +82,8 @@ struct backtrace_gpu_memory {}; struct backtrace_gpu_vcn {}; +struct backtrace_gpu_jpeg +{}; using sampling_wall_clock = data_tracker; using sampling_cpu_clock = data_tracker; using sampling_percent = data_tracker; @@ -90,6 +92,7 @@ using sampling_gpu_temp = data_tracker; using sampling_gpu_power = data_tracker; using sampling_gpu_memory = data_tracker; using sampling_gpu_vcn = data_tracker; +using sampling_gpu_jpeg = data_tracker; template @@ -123,6 +126,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_temp, fal ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_power, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_memory, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_vcn, false_type) +ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_jpeg, false_type) #endif TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_wall_clock, @@ -151,6 +155,9 @@ TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_temp, project::ro TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_vcn, project::rocprofsys, tpls::rocm, device::gpu, os::supports_linux, category::sampling, category::process_sampling) +TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_jpeg, project::rocprofsys, + tpls::rocm, device::gpu, os::supports_linux, + category::sampling, category::process_sampling) TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_wall_clock, "sampling_wall_clock", "Wall-clock timing", @@ -179,6 +186,10 @@ TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_vcn, "sampling_gpu_vcn", "GPU VCN Utilization (% activity) via ROCm-SMI", "Derived from sampling") +TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_jpeg, + "sampling_gpu_jpeg", + "GPU JPEG Utilization (% activity) via ROCm-SMI", + "Derived from sampling") // statistics type TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_wall_clock, double) @@ -188,6 +199,7 @@ TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_temp, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_power, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_memory, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_vcn, double) +TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_jpeg, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::comm_data_tracker_t, float) // enable timing units @@ -220,6 +232,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_temp, false ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_power, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_memory, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_vcn, false_type) +ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_jpeg, false_type) // reporting categories (mean) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_mean, component::sampling_percent, false_type) diff --git a/source/lib/core/config.cpp b/source/lib/core/config.cpp index eb00891aef..f77bc502e8 100644 --- a/source/lib/core/config.cpp +++ b/source/lib/core/config.cpp @@ -315,10 +315,10 @@ configure_settings(bool _init) "Enable ROCm API and kernel tracing", true, "backend", "rocm"); - ROCPROFSYS_CONFIG_SETTING( - bool, "ROCPROFSYS_USE_ROCM_SMI", - "Enable sampling GPU power, temp, utilization, vcn_activity and memory usage", - true, "backend", "rocm_smi", "rocm", "process_sampling"); + ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCM_SMI", + "Enable sampling GPU power, temp, utilization, " + "vcn_activity, jpeg_activity and memory usage", + true, "backend", "rocm_smi", "rocm", "process_sampling"); ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_SAMPLING", "Enable statistical sampling of call-stack", false, @@ -626,11 +626,11 @@ configure_settings(bool _init) rocprofiler_sdk::config_settings(_config); - ROCPROFSYS_CONFIG_SETTING( - std::string, "ROCPROFSYS_ROCM_SMI_METRICS", - "rocm-smi metrics to collect: busy, temp, power, vcn_activity, mem_usage", - "busy,temp,power,vcn_activity,mem_usage", "backend", "rocm_smi", "rocm", - "process_sampling", "advanced"); + ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_SMI_METRICS", + "rocm-smi metrics to collect: busy, temp, power, " + "vcn_activity, jpeg_activity, mem_usage", + "busy,temp,power,mem_usage", "backend", "rocm_smi", "rocm", + "process_sampling", "advanced"); ROCPROFSYS_CONFIG_SETTING(size_t, "ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB", "Hint for shared-memory buffer size in perfetto (in KB)", diff --git a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h index 09b16e65e1..bba92e8338 100644 --- a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h +++ b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h @@ -57,8 +57,9 @@ extern "C" ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, - ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, + ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY, + ROCPROFSYS_CATEGORY_ROCM_SMI_JPEG_ACTIVITY, ROCPROFSYS_CATEGORY_ROCM_RCCL, ROCPROFSYS_CATEGORY_SAMPLING, ROCPROFSYS_CATEGORY_PTHREAD, diff --git a/source/lib/rocprof-sys/library/rocm_smi.cpp b/source/lib/rocprof-sys/library/rocm_smi.cpp index bdf5fc46c6..1d8bac5073 100644 --- a/source/lib/rocprof-sys/library/rocm_smi.cpp +++ b/source/lib/rocprof-sys/library/rocm_smi.cpp @@ -160,12 +160,15 @@ data::sample(uint32_t _dev_id) &m_power, &power_type) ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).mem_usage, rsmi_dev_memory_usage_get, _dev_id, RSMI_MEM_TYPE_VRAM, &m_mem_usage); - ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).vcn_activity, - rsmi_dev_gpu_metrics_info_get, _dev_id, &_gpu_metrics); + ROCPROFSYS_ROCM_SMI_CALL(rsmi_dev_gpu_metrics_info_get(_dev_id, &_gpu_metrics)); - for(const auto& activity : _gpu_metrics.vcn_activity) + for(const auto& v_activity : _gpu_metrics.vcn_activity) { - if(activity != UINT16_MAX) m_vcn_metrics.push_back(activity); + if(v_activity != UINT16_MAX) m_vcn_metrics[_dev_id].push_back(v_activity); + } + for(const auto& j_activity : _gpu_metrics.jpeg_activity) + { + if(j_activity != UINT16_MAX) m_jpeg_metrics[_dev_id].push_back(j_activity); } #undef ROCPROFSYS_RSMI_GET @@ -262,6 +265,7 @@ void data::post_process(uint32_t _dev_id) { using component::sampling_gpu_busy; + using component::sampling_gpu_jpeg; using component::sampling_gpu_memory; using component::sampling_gpu_power; using component::sampling_gpu_temp; @@ -282,7 +286,7 @@ data::post_process(uint32_t _dev_id) auto _settings = get_settings(_dev_id); auto _process_perfetto = [&]() { - auto _idx = std::array{}; + auto _idx = std::array{}; { _idx.fill(_idx.size()); uint64_t nidx = 0; @@ -291,6 +295,7 @@ data::post_process(uint32_t _dev_id) if(_settings.power) _idx.at(2) = nidx++; if(_settings.mem_usage) _idx.at(3) = nidx++; if(_settings.vcn_activity) _idx.at(4) = nidx++; + if(_settings.jpeg_activity) _idx.at(5) = nidx++; } for(auto& itr : _rocm_smi) @@ -302,6 +307,18 @@ data::post_process(uint32_t _dev_id) auto addendum = [&](const char* _v) { return JOIN(" ", "GPU", _v, JOIN("", '[', _dev_id, ']'), "(S)"); }; + auto addendum_blk = [&](std::size_t _i, const char* _metric) { + if(_i < 10) + { + return JOIN(" ", "GPU", JOIN("", '[', _dev_id, ']'), _metric, + JOIN("", "[0", _i, ']'), "(S)"); + } + else + { + return JOIN(" ", "GPU", JOIN("", '[', _dev_id, ']'), _metric, + JOIN("", '[', _i, ']'), "(S)"); + } + }; if(_settings.busy) counter_track::emplace(_dev_id, addendum("Busy"), "%"); if(_settings.temp) @@ -313,11 +330,25 @@ data::post_process(uint32_t _dev_id) "megabytes"); if(_settings.vcn_activity) { - for(std::size_t i = 0; i < std::size(itr.m_vcn_metrics); ++i) - counter_track::emplace( - _dev_id, - addendum(("VCN Activity on " + std::to_string(i)).c_str()), - "%"); + for(const auto& [dev_id, metrics] : itr.m_vcn_metrics) + { + for(std::size_t i = 0; i < std::size(metrics); ++i) + { + counter_track::emplace( + _dev_id, addendum_blk(i, " VCN Activity"), "%"); + } + } + } + if(_settings.jpeg_activity) + { + for(const auto& [dev_id, metrics] : itr.m_jpeg_metrics) + { + for(std::size_t i = 0; i < std::size(metrics); ++i) + { + counter_track::emplace(_dev_id, + addendum_blk(i, "JPEG Activity"), "%"); + } + } } } uint64_t _ts = itr.m_ts; @@ -342,12 +373,28 @@ data::post_process(uint32_t _dev_id) counter_track::at(_dev_id, _idx.at(3)), _ts, _usage); if(_settings.vcn_activity) { - uint64_t idx = _idx.at(4); - for(const auto& temp : itr.m_vcn_metrics) + for(const auto& [dev_id, metrics] : itr.m_vcn_metrics) { - TRACE_COUNTER("device_vcn_activity", counter_track::at(_dev_id, idx), - _ts, temp); - ++idx; + for(std::size_t i = 0; i < std::size(metrics); ++i) + { + double _vcn_activity = metrics[i]; + TRACE_COUNTER("device_vcn_activity", + counter_track::at(_dev_id, _idx.at(4) + i), _ts, + _vcn_activity); + } + } + } + if(_settings.jpeg_activity) + { + for(const auto& [dev_id, metrics] : itr.m_jpeg_metrics) + { + for(std::size_t i = 0; i < std::size(metrics); ++i) + { + double _jpeg_activity = metrics[i]; + TRACE_COUNTER("device_jpeg_activity", + counter_track::at(_dev_id, _idx.at(5) + i), _ts, + _jpeg_activity); + } } } } @@ -440,9 +487,10 @@ setup() key_pair_t{ "power", get_settings(dev_id).power }, key_pair_t{ "mem_usage", get_settings(dev_id).mem_usage }, key_pair_t{ "vcn_activity", get_settings(dev_id).vcn_activity }, + key_pair_t{ "jpeg_activity", get_settings(dev_id).jpeg_activity }, }; - get_settings(dev_id) = { false, false, false, false }; + get_settings(dev_id) = { false, false, false, false, false, false }; for(const auto& metric : tim::delimit(*_metrics, ",;:\t\n ")) { auto iitr = supported.find(metric); @@ -524,3 +572,7 @@ ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) diff --git a/source/lib/rocprof-sys/library/rocm_smi.hpp b/source/lib/rocprof-sys/library/rocm_smi.hpp index 133e60fe51..39d50ae9e1 100644 --- a/source/lib/rocprof-sys/library/rocm_smi.hpp +++ b/source/lib/rocprof-sys/library/rocm_smi.hpp @@ -71,11 +71,12 @@ device_count(); struct settings { - bool busy = true; - bool temp = true; - bool power = true; - bool mem_usage = true; - bool vcn_activity = true; + bool busy = true; + bool temp = true; + bool power = true; + bool mem_usage = true; + bool vcn_activity = true; + bool jpeg_activity = true; }; struct data @@ -100,13 +101,14 @@ struct data static void post_process(uint32_t _dev_id); - uint32_t m_dev_id = std::numeric_limits::max(); - timestamp_t m_ts = 0; - busy_perc_t m_busy_perc = 0; - temp_t m_temp = 0; - power_t m_power = 0; - mem_usage_t m_mem_usage = 0; - std::vector m_vcn_metrics = {}; + uint32_t m_dev_id = std::numeric_limits::max(); + timestamp_t m_ts = 0; + busy_perc_t m_busy_perc = 0; + temp_t m_temp = 0; + power_t m_power = 0; + mem_usage_t m_mem_usage = 0; + std::unordered_map> m_vcn_metrics = {}; + std::unordered_map> m_jpeg_metrics = {}; friend std::ostream& operator<<(std::ostream& _os, const data& _v) { @@ -185,5 +187,9 @@ ROCPROFSYS_DECLARE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + # endif #endif diff --git a/source/lib/rocprof-sys/library/sampling.cpp b/source/lib/rocprof-sys/library/sampling.cpp index 0701e5fe42..f87987b362 100644 --- a/source/lib/rocprof-sys/library/sampling.cpp +++ b/source/lib/rocprof-sys/library/sampling.cpp @@ -126,6 +126,7 @@ using component::backtrace_wall_clock; // NOLINT using component::callchain; using component::sampling_cpu_clock; using component::sampling_gpu_busy; +using component::sampling_gpu_jpeg; using component::sampling_gpu_memory; using component::sampling_gpu_power; using component::sampling_gpu_temp; @@ -1575,10 +1576,16 @@ struct sampling_initialization sampling_gpu_temp::set_format_flags(sampling_gpu_temp::get_format_flags()); sampling_gpu_vcn::label() = "sampling_gpu_vcn_percent"; - sampling_gpu_vcn::description() = "Utilization of VCN(s)"; + sampling_gpu_vcn::description() = "VCN instance(s) activity"; sampling_gpu_vcn::set_precision(0); sampling_gpu_vcn::set_format_flags(sampling_gpu_vcn::get_format_flags() & std::ios_base::showpoint); + + sampling_gpu_jpeg::label() = "sampling_gpu_jpeg_percent"; + sampling_gpu_jpeg::description() = "JPEG instance(s) activity"; + sampling_gpu_jpeg::set_precision(0); + sampling_gpu_jpeg::set_format_flags(sampling_gpu_jpeg::get_format_flags() & + std::ios_base::showpoint); } }; } // namespace diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6272de846f..458b251842 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -23,6 +23,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-overflow-tests.cmake) include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-annotate-tests.cmake) include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-causal-tests.cmake) include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-python-tests.cmake) -include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-videodecode-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-decode-tests.cmake) add_subdirectory(source) diff --git a/tests/rocprof-sys-decode-tests.cmake b/tests/rocprof-sys-decode-tests.cmake new file mode 100644 index 0000000000..3945ce22bf --- /dev/null +++ b/tests/rocprof-sys-decode-tests.cmake @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------- # +# +# video decode tests +# +# -------------------------------------------------------------------------------------- # + +set(_decode_environment + "${_base_environment}" + "ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy,rocdecode_api" + "ROCPROFSYS_ROCM_SMI_METRICS=busy,temp,power,vcn_activity,jpeg_activity,mem_usage" + "ROCPROFSYS_SAMPLING_CPUS=none") + +rocprofiler_systems_add_test( + SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE + NAME video-decode + TARGET videodecode + GPU ON + ENVIRONMENT "${_decode_environment}" + RUN_ARGS -i ${PROJECT_BINARY_DIR}/videos -t 1 + LABELS "decode") + +rocprofiler_systems_add_validation_test( + NAME video-decode-sampling + PERFETTO_METRIC "rocm_rocdecode_api" + PERFETTO_FILE "perfetto-trace.proto" + LABELS "decode" + ARGS -l rocDecCreateVideoParser -c 2 -d 1 --counter-names "VCN Activity") + +# -------------------------------------------------------------------------------------- # +# +# image decode tests +# +# -------------------------------------------------------------------------------------- # + +rocprofiler_systems_add_test( + SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE + NAME image-decode + TARGET jpegdecode + GPU ON + ENVIRONMENT "${_decode_environment}" + RUN_ARGS -i ${PROJECT_BINARY_DIR}/images -b 32 + LABELS "decode") + +rocprofiler_systems_add_validation_test( + NAME image-decode-sampling + PERFETTO_METRIC "host" + PERFETTO_FILE "perfetto-trace.proto" + LABELS "decode" + ARGS -l jpegdecode -c 1 -d 0 --counter-names "JPEG Activity") diff --git a/tests/rocprof-sys-videodecode-tests.cmake b/tests/rocprof-sys-videodecode-tests.cmake deleted file mode 100644 index 177ce004ca..0000000000 --- a/tests/rocprof-sys-videodecode-tests.cmake +++ /dev/null @@ -1,22 +0,0 @@ -# -------------------------------------------------------------------------------------- # -# -# video decode tests -# -# -------------------------------------------------------------------------------------- # - -rocprofiler_systems_add_test( - SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE - NAME videodecode - TARGET videodecode - GPU ON - ENVIRONMENT - "${_base_environment};ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy,rocdecode_api" - RUN_ARGS -i ${PROJECT_BINARY_DIR}/videos -t 1 - LABELS "videodecode") - -rocprofiler_systems_add_validation_test( - NAME videodecode-sampling - PERFETTO_METRIC "rocm_rocdecode_api" - PERFETTO_FILE "perfetto-trace.proto" - LABELS "videodecode" - ARGS -l rocDecCreateVideoParser -c 2 -d 1 --counter-names "GPU VCN Activity") diff --git a/tests/validate-perfetto-proto.py b/tests/validate-perfetto-proto.py index e31d0cd1ef..4a173c9890 100755 --- a/tests/validate-perfetto-proto.py +++ b/tests/validate-perfetto-proto.py @@ -186,7 +186,7 @@ if __name__ == "__main__": sum_counter_values = tp.query( f"""SELECT SUM(counter.value) AS total_value FROM counter_track JOIN counter ON counter.track_id = counter_track.id WHERE counter_track.name LIKE - '{counter_name}%'""" + '%{counter_name}%'""" ) total_value = 0 for row in sum_counter_values: