diff --git a/samples/videoDecodeRGB/CMakeLists.txt b/samples/videoDecodeRGB/CMakeLists.txt new file mode 100644 index 0000000000..a30908cb7b --- /dev/null +++ b/samples/videoDecodeRGB/CMakeLists.txt @@ -0,0 +1,82 @@ +################################################################################ +# Copyright (c) 2023 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +cmake_minimum_required (VERSION 3.5) +project(videodecodergb) +set(CMAKE_CXX_STANDARD 17) + +# ROCM Path +if(DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Default ROCm installation path") +elseif(ROCM_PATH) + message("-- INFO:ROCM_PATH Set -- ${ROCM_PATH}") +else() + set(ROCM_PATH /opt/rocm CACHE PATH "Default ROCm installation path") +endif() + +list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../../cmake) +list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH}) +set(CMAKE_CXX_COMPILER ${ROCM_PATH}/llvm/bin/clang++) + +set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx940;gfx1030;gfx1031;gfx1032;gfx1100") +set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") +find_package(HIP QUIET) +find_package(FFmpeg QUIET) +# find rocDecode +find_library(ROCDECODE_LIBRARY NAMES rocdecode HINTS {ROCM_PATH}/lib) +find_path(ROCDECODE_INCLUDE_DIR NAMES rocdecode.h PATHS /opt/rocm/include/rocdecode {ROCM_PATH}/include/rocdecode) + +if(ROCDECODE_LIBRARY AND ROCDECODE_INCLUDE_DIR) + set(ROCDECODE_FOUND TRUE) + message("-- ${White}Using rocDecode -- \n\tLibraries:${ROCDECODE_LIBRARY} \n\tIncludes:${ROCDECODE_INCLUDE_DIR}${ColourReset}") +endif() + +if(HIP_FOUND AND FFMPEG_FOUND AND ROCDECODE_FOUND) + # HIP + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} hip::device) + # FFMPEG + include_directories(${AVUTIL_INCLUDE_DIR} ${AVCODEC_INCLUDE_DIR}) + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} ${FFMPEG_LIBRARIES}) + # rocDecode and utils + include_directories (${CMAKE_CURRENT_SOURCE_DIR}/../../utils ${ROCDECODE_INCLUDE_DIR}) + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} ${ROCDECODE_LIBRARY}) + list(APPEND SOURCES ${PROJECT_SOURCE_DIR} + videodecrgb.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/rocvideodecode/roc_video_dec.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/colorspace_kernels.cpp) + + add_executable(${PROJECT_NAME} ${SOURCES}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") + target_link_libraries(${PROJECT_NAME} ${LINK_LIBRARY_LIST}) +else() + message("-- ERROR!: ${PROJECT_NAME} excluded! please install all the dependencies and try again!") + if (NOT HIP_FOUND) + message(FATAL_ERROR "-- ERROR!: HIP Not Found! - please install ROCm and HIP!") + endif() + if (NOT FFMPEG_FOUND) + message(FATAL_ERROR "-- ERROR!: FFMPEG Not Found! - please install FFMPEG!") + endif() + if (NOT ROCDECODE_FOUND) + message(FATAL_ERROR "-- ERROR!: rocDecode Not Found! - please install rocDecode!") + endif() +endif() diff --git a/samples/videoDecodeRGB/README.md b/samples/videoDecodeRGB/README.md new file mode 100644 index 0000000000..db2efb433e --- /dev/null +++ b/samples/videoDecodeRGB/README.md @@ -0,0 +1,11 @@ +# Video Decode Sample +This sample illustrates the FFMPEG demuxer to get the individual frames which are then decoded using rocDecode API and optionally color-converted using custom HIP kernels on AMD hardware. This sample converts decoded YUV output to one of the RGB or BGR formats(24bit, 32bit, 464bit) + +## Build and run the sample: +``` +mkdir build +cd build +cmake .. +make -j +./videodecodergb -i -o -d -of +``` \ No newline at end of file diff --git a/samples/videoDecodeRGB/videodecrgb.cpp b/samples/videoDecodeRGB/videodecrgb.cpp new file mode 100644 index 0000000000..41a1cef2a5 --- /dev/null +++ b/samples/videoDecodeRGB/videodecrgb.cpp @@ -0,0 +1,325 @@ +/* +Copyright (c) 2023 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#if __cplusplus >= 201703L && __has_include() + #include +#else + #include +#endif +#include "video_demuxer.h" +#include "roc_video_dec.h" +#include "colorspace_kernels.h" + +FILE *fpOut = nullptr; +enum OutputFormatEnum { + native = 0, bgr, bgr48, rgb, rgb48, bgra, bgra64, rgba, rgba64 +}; +std::vector st_output_format_name = {"native", "bgr", "bgr48", "rgb", "rgb48", "bgra", "bgra64", "rgba", "rgba64"}; + +void ShowHelpAndExit(const char *option = NULL) { + std::cout << "Options:" << std::endl + << "-i Input File Path - required" << std::endl + << "-o Output File Path - dumps output if requested; optional" << std::endl + << "-d GPU device ID (0 for the first device, 1 for the second, etc.); optional; default: 0" << std::endl + << "-of Output Format name - (native, bgr, bgr48, rgb, rgb48, bgra, bgra64, rgba, rgba64; converts native YUV frame to RGB image format; optional; default: 0" << std::endl + << "-crop crop rectangle for output (not used when using interopped decoded frame); optional; default: 0" << std::endl + << "-m output_surface_memory_type - decoded surface memory; optional; default - 0" << std::endl; + + exit(0); +} + +void DumpRGBImage(std::string outputfileName, void* pdevMem, OutputSurfaceInfo *surf_info, int rgb_image_size) { + if (fpOut == nullptr) { + fpOut = fopen(outputfileName.c_str(), "wb"); + } + uint8_t *hstPtr = nullptr; + hstPtr = new uint8_t [rgb_image_size]; + hipError_t hip_status = hipSuccess; + hip_status = hipMemcpyDtoH((void *)hstPtr, pdevMem, rgb_image_size); + if (hip_status != hipSuccess) { + std::cout << "ERROR: hipMemcpyDtoH failed! (" << hip_status << ")" << std::endl; + delete [] hstPtr; + return; + } + if (fpOut) { + fwrite(hstPtr, 1, rgb_image_size, fpOut); + } + + if (hstPtr != nullptr) { + delete [] hstPtr; + hstPtr = nullptr; + } +} + +void ColorConvertYUV2RGB(uint8_t *p_src, OutputSurfaceInfo *surf_info, uint8_t *rgb_dev_mem_ptr, OutputFormatEnum e_output_format) { + + int rgb_width = (surf_info->output_width + 1) & ~1; // has to be a multiple of 2 for hip colorconvert kernels + // todo:: get color standard from the decoder + if (surf_info->surface_format == rocDecVideoSurfaceFormat_YUV444) { + if (e_output_format == bgr) + YUV444ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == bgra) + YUV444ToColor32(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 4 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgb) + YUV444ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgba) + YUV444ToColor32(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 4 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + } else if (surf_info->surface_format == rocDecVideoSurfaceFormat_NV12) { + if (e_output_format == bgr) + Nv12ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == bgra) + Nv12ToColor32(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 4 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgb) + Nv12ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgba) + Nv12ToColor32(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 4 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + } + if (surf_info->surface_format == rocDecVideoSurfaceFormat_YUV444_16Bit) { + if (e_output_format == bgr) + YUV444P16ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgb) + YUV444P16ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == bgr48) + YUV444P16ToColor48(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 6 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgb48) + YUV444P16ToColor48(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 6 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == bgra64) + YUV444P16ToColor64(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 8 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgba64) + YUV444P16ToColor64(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 8 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + } else if (surf_info->surface_format == rocDecVideoSurfaceFormat_P016) { + if (e_output_format == bgr) + P016ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgb) + P016ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == bgr48) + P016ToColor48(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 6 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgb48) + P016ToColor48(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 6 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == bgra64) + P016ToColor64(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 8 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + else if (e_output_format == rgba64) + P016ToColor64(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 8 * rgb_width, surf_info->output_width, + surf_info->output_height, surf_info->output_vstride, 0); + } +} + +int main(int argc, char **argv) { + + std::string input_file_path, output_file_path; + int dump_output_frames = 0; + int convert_to_rgb = 0; + int device_id = 0; + Rect crop_rect = {}; + Rect *p_crop_rect = nullptr; + size_t rgb_image_size; + uint32_t rgb_image_stride; + hipError_t hip_status = hipSuccess; + uint8_t *p_rgb_dev_mem= nullptr; + OutputSurfaceMemoryType mem_type = OUT_SURFACE_MEM_DEV_INTERNAL; // set to internal + OutputFormatEnum e_output_format = native; + int rgb_width; + + // Parse command-line arguments + if(argc < 1) { + ShowHelpAndExit(); + } + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "-h")) { + ShowHelpAndExit(); + } + if (!strcmp(argv[i], "-i")) { + if (++i == argc) { + ShowHelpAndExit("-i"); + } + input_file_path = argv[i]; + continue; + } + if (!strcmp(argv[i], "-o")) { + if (++i == argc) { + ShowHelpAndExit("-o"); + } + output_file_path = argv[i]; + dump_output_frames = 1; + continue; + } + if (!strcmp(argv[i], "-d")) { + if (++i == argc) { + ShowHelpAndExit("-d"); + } + device_id = atoi(argv[i]); + continue; + } + if (!strcmp(argv[i], "-m")) { + if (++i == argc) { + ShowHelpAndExit("-m"); + } + mem_type = static_cast(atoi(argv[i])); + continue; + } + if (!strcmp(argv[i], "-crop")) { + if (++i == argc || 4 != sscanf(argv[i], "%d,%d,%d,%d", &crop_rect.l, &crop_rect.t, &crop_rect.r, &crop_rect.b)) { + ShowHelpAndExit("-crop"); + } + if ((crop_rect.r - crop_rect.l) % 2 == 1 || (crop_rect.b - crop_rect.t) % 2 == 1) { + std::cout << "output crop rectangle must have width and height of even numbers" << std::endl; + exit(1); + } + p_crop_rect = &crop_rect; + continue; + } + if (!strcmp(argv[i], "-of")) { + if (++i == argc) { + ShowHelpAndExit("-of"); + } + auto it = find(st_output_format_name.begin(), st_output_format_name.end(), argv[i]); + if (it == st_output_format_name.end()) { + ShowHelpAndExit("-of"); + } + e_output_format = (OutputFormatEnum)(it-st_output_format_name.begin()); + continue; + } + ShowHelpAndExit(argv[i]); + } + + try { + VideoDemuxer demuxer(input_file_path.c_str()); + rocDecVideoCodec rocdec_codec_id = AVCodec2RocDecVideoCodec(demuxer.GetCodecID()); + RocVideoDecoder viddec(device_id, mem_type, rocdec_codec_id, false, p_crop_rect); + + std::string device_name, gcn_arch_name; + int pci_bus_id, pci_domain_id, pci_device_id; + hipStream_t stream = viddec.GetStream(); + + viddec.GetDeviceinfo(device_name, gcn_arch_name, pci_bus_id, pci_domain_id, pci_device_id); + std::cout << "info: Using GPU device " << device_id << " " << device_name << "[" << gcn_arch_name << "] on PCI bus " << + std::setfill('0') << std::setw(2) << std::right << std::hex << pci_bus_id << ":" << std::setfill('0') << std::setw(2) << + std::right << std::hex << pci_domain_id << "." << pci_device_id << std::dec << std::endl; + std::cout << "info: decoding started, please wait!" << std::endl; + + int n_video_bytes = 0, n_frames_returned = 0, n_frame = 0; + uint8_t *p_video = nullptr; + uint8_t *p_frame = nullptr; + int64_t pts = 0; + OutputSurfaceInfo *surf_info; + uint32_t width, height; + double total_dec_time = 0; + convert_to_rgb = e_output_format != native; + + do { + auto startTime = std::chrono::high_resolution_clock::now(); + demuxer.Demux(&p_video, &n_video_bytes, &pts); + n_frames_returned = viddec.DecodeFrame(p_video, n_video_bytes, 0, pts); + auto end_time = std::chrono::high_resolution_clock::now(); + auto time_per_frame = std::chrono::duration(end_time - startTime).count(); + total_dec_time += time_per_frame; + if (!n_frame && !viddec.GetOutputSurfaceInfo(&surf_info)){ + std::cerr << "Error: Failed to get Output Image Info!" << std::endl; + break; + } + + for (int i = 0; i < n_frames_returned; i++) { + p_frame = viddec.GetFrame(&pts); + if (convert_to_rgb) { + if (surf_info->bit_depth == 8) { + rgb_width = (surf_info->output_width + 1) & ~1; // has to be a multiple of 2 for hip colorconvert kernels + rgb_image_size = ((e_output_format == bgr) || (e_output_format == rgb)) ? rgb_width * surf_info->output_height * 3 : rgb_width * surf_info->output_height * 4; + } else { // 16bit + rgb_width = (surf_info->output_width + 1) & ~1; // has to be a multiple of 2 for hip colorconvert kernels + rgb_image_size = ((e_output_format == bgr) || (e_output_format == rgb)) ? rgb_width * surf_info->output_height * 3 : ((e_output_format == bgr48) || (e_output_format == rgb48)) ? + rgb_width * surf_info->output_height * 6 : rgb_width * surf_info->output_height * 8; + } + if (p_rgb_dev_mem == nullptr) { + hip_status = hipMalloc(&p_rgb_dev_mem, rgb_image_size); + if (hip_status != hipSuccess) { + std::cerr << "ERROR: hipMalloc failed to allocate the device memory for the output!" << hip_status << std::endl; + return -1; + } + } + ColorConvertYUV2RGB(p_frame, surf_info, p_rgb_dev_mem, e_output_format); + } + if (dump_output_frames) { + if (convert_to_rgb) + DumpRGBImage(output_file_path, p_rgb_dev_mem, surf_info, rgb_image_size); + else + viddec.SaveFrameToFile(output_file_path, p_frame, surf_info); + } + // release frame + viddec.ReleaseFrame(pts); + } + n_frame += n_frames_returned; + } while (n_video_bytes); + + if (p_rgb_dev_mem != nullptr) { + hip_status = hipFree(p_rgb_dev_mem); + if (hip_status != hipSuccess) { + std::cout << "ERROR: hipFree failed! (" << hip_status << ")" << std::endl; + return -1; + } + } + if (fpOut) { + fclose(fpOut); + fpOut = nullptr; + } + + std::cout << "info: Video codec format: " << viddec.GetCodecFmtName(viddec.GetCodecId()) << std::endl; + std::cout << "info: Video size: [ " << surf_info->output_width << ", " << surf_info->output_height << " ]" << std::endl; + std::cout << "info: Video surface format: " << viddec.GetSurfaceFmtName(surf_info->surface_format) << std::endl; + std::cout << "info: Video Bit depth: " << surf_info->bit_depth << std::endl; + std::cout << "info: Total frame decoded: " << n_frame << std::endl; + if (!dump_output_frames) { + std::cout << "info: avg decoding time per frame (ms): " << total_dec_time / n_frame << std::endl; + std::cout << "info: avg FPS: " << (n_frame / total_dec_time) * 1000 << std::endl; + } + } catch (const std::exception &ex) { + std::cout << ex.what() << std::endl; + exit(1); + } + + return 0; +} diff --git a/utils/colorspace_kernels.cpp b/utils/colorspace_kernels.cpp new file mode 100644 index 0000000000..49e35740fa --- /dev/null +++ b/utils/colorspace_kernels.cpp @@ -0,0 +1,593 @@ +/* +Copyright (c) 2023 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "colorspace_kernels.h" +#include "roc_video_dec.h" + +__constant__ float yuv_to_rgb_mat[3][3]; +__constant__ float rgb_to_yuv_mat[3][3]; + + +void inline GetColMatCoefficients(int col_standard, float &wr, float &wb, int &black, int &white, int &max) { + black = 16; white = 235; + max = 255; + + switch (col_standard) + { + case ColorSpaceStandard_BT709: + default: + wr = 0.2126f; wb = 0.0722f; + break; + + case ColorSpaceStandard_FCC: + wr = 0.30f; wb = 0.11f; + break; + + case ColorSpaceStandard_BT470: + case ColorSpaceStandard_BT601: + wr = 0.2990f; wb = 0.1140f; + break; + + case ColorSpaceStandard_SMPTE240M: + wr = 0.212f; wb = 0.087f; + break; + + case ColorSpaceStandard_BT2020: + case ColorSpaceStandard_BT2020C: + wr = 0.2627f; wb = 0.0593f; + // 10-bit only + black = 64 << 6; white = 940 << 6; + max = (1 << 16) - 1; + break; + } +} + +void SetMatYuv2Rgb(int col_standard) { + float wr, wb; + int black, white, max; + GetColMatCoefficients(col_standard, wr, wb, black, white, max); + float mat[3][3] = { + 1.0f, 0.0f, (1.0f - wr) / 0.5f, + 1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr), + 1.0f, (1.0f - wb) / 0.5f, 0.0f, + }; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + mat[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]); + } + } + HIP_API_CALL(hipMemcpyToSymbol(yuv_to_rgb_mat, mat, sizeof(mat))); +} + +void SetMatRgb2Yuv(int col_standard) { + float wr, wb; + int black, white, max; + GetColMatCoefficients(col_standard, wr, wb, black, white, max); + float mat[3][3] = { + wr, 1.0f - wb - wr, wb, + -0.5f * wr / (1.0f - wb), -0.5f * (1 - wb - wr) / (1.0f - wb), 0.5f, + 0.5f, -0.5f * (1.0f - wb - wr) / (1.0f - wr), -0.5f * wb / (1.0f - wr), + }; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + mat[i][j] = (float)(1.0 * (white - black) / max * mat[i][j]); + } + } + HIP_API_CALL(hipMemcpyToSymbol(rgb_to_yuv_mat, mat, sizeof(mat))); +} + +template +__device__ static T Clamp(T x, T lower, T upper) { + return x < lower ? lower : (x > upper ? upper : x); +} + +template +__device__ inline Rgb YuvToRgbForPixel(YuvUnit y, YuvUnit u, YuvUnit v) { + const int + low = 1 << (sizeof(YuvUnit) * 8 - 4), + mid = 1 << (sizeof(YuvUnit) * 8 - 1); + float fy = (int)y - low, fu = (int)u - mid, fv = (int)v - mid; + const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f; + YuvUnit + r = (YuvUnit)Clamp(yuv_to_rgb_mat[0][0] * fy + yuv_to_rgb_mat[0][1] * fu + yuv_to_rgb_mat[0][2] * fv, 0.0f, maxf), + g = (YuvUnit)Clamp(yuv_to_rgb_mat[1][0] * fy + yuv_to_rgb_mat[1][1] * fu + yuv_to_rgb_mat[1][2] * fv, 0.0f, maxf), + b = (YuvUnit)Clamp(yuv_to_rgb_mat[2][0] * fy + yuv_to_rgb_mat[2][1] * fu + yuv_to_rgb_mat[2][2] * fv, 0.0f, maxf); + + Rgb rgb{}; + const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(rgb.c.r)) * 8; + if (sizeof(YuvUnit) >= sizeof(rgb.c.r)) { + rgb.c.r = r >> nShift; + rgb.c.g = g >> nShift; + rgb.c.b = b >> nShift; + } else { + rgb.c.r = r << nShift; + rgb.c.g = g << nShift; + rgb.c.b = b << nShift; + } + return rgb; +} + +// yuv to RGBA (32/64 bit) +template +__global__ static void YuvToRgbaKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= width || y + 1 >= height) { + return; + } + + uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; + uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; + + YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; + YuvUnitx2 l1 = *(YuvUnitx2 *)(p_src + yuv_pitch); + YuvUnitx2 ch = *(YuvUnitx2 *)(p_src + (v_pitch - y / 2) * yuv_pitch); + + *(RgbIntx2 *)p_dst = RgbIntx2 { + YuvToRgbForPixel(l0.x, ch.x, ch.y).d, + YuvToRgbForPixel(l0.y, ch.x, ch.y).d, + }; + *(RgbIntx2 *)(p_dst + rgb_pitch) = RgbIntx2 { + YuvToRgbForPixel(l1.x, ch.x, ch.y).d, + YuvToRgbForPixel(l1.y, ch.x, ch.y).d, + }; +} + +// yuv to RGB (24/48 bit) +template +__global__ static void YuvToRgbKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= width || y + 1 >= height) { + return; + } + + uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; + uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; + uint8_t *p_dst1 = p_dst + rgb_pitch; + + YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; + YuvUnitx2 l1 = *(YuvUnitx2 *)(p_src + yuv_pitch); + YuvUnitx2 ch = *(YuvUnitx2 *)(p_src + (v_pitch - y / 2) * yuv_pitch); + Rgb rgb0 = YuvToRgbForPixel(l0.x, ch.x, ch.y), + rgb1 = YuvToRgbForPixel(l0.y, ch.x, ch.y), + rgb2 = YuvToRgbForPixel(l1.x, ch.x, ch.y), + rgb3 = YuvToRgbForPixel(l1.y, ch.x, ch.y); + + *(RgbInt1 *)p_dst = RgbInt1 { rgb0.v.x, rgb0.v.y, rgb0.v.z, rgb1.v.x }; + *(RgbInt2 *)(p_dst + sizeof(RgbInt1)) = RgbInt2 { rgb1.v.y, rgb1.v.z }; + *(RgbInt1 *)(p_dst1) = RgbInt1 { rgb2.v.x, rgb2.v.y, rgb2.v.z, rgb3.v.x }; + *(RgbInt2 *)(p_dst1 + sizeof(RgbInt1)) = RgbInt2 { rgb3.v.y, rgb3.v.z }; +} + +// yuv444 to RGBA (32/64 bit) +template +__global__ static void Yuv444ToRgbaKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= width || y >= height) { + return; + } + + uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; + uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; + + YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; + YuvUnitx2 ch1 = *(YuvUnitx2 *)(p_src + (v_pitch * yuv_pitch)); + YuvUnitx2 ch2 = *(YuvUnitx2 *)(p_src + (2 * v_pitch * yuv_pitch)); + + *(RgbIntx2 *)p_dst = RgbIntx2{ + YuvToRgbForPixel(l0.x, ch1.x, ch2.x).d, + YuvToRgbForPixel(l0.y, ch1.y, ch2.y).d, + }; +} + +// yuv444 to RGB (24/48 bit) +template +__global__ static void Yuv444ToRgbKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= width || y >= height) { + return; + } + + uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; + uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; + + YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; + YuvUnitx2 ch1 = *(YuvUnitx2 *)(p_src + (v_pitch * yuv_pitch)); + YuvUnitx2 ch2 = *(YuvUnitx2 *)(p_src + (2 * v_pitch * yuv_pitch)); + Rgb rgb0 = YuvToRgbForPixel(l0.x, ch1.x, ch2.x), + rgb1 = YuvToRgbForPixel(l0.y, ch1.y, ch2.y); + + *(RgbInt1 *)p_dst = RgbInt1 { rgb0.v.x, rgb0.v.y, rgb0.v.z, rgb1.v.x }; + *(RgbInt2 *)(p_dst + sizeof(RgbInt1)) = RgbInt2 { rgb1.v.y, rgb1.v.z }; +} + + +template +__global__ static void YuvToRgbPlanarKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgbp, int nRgbpPitch, int width, int height, int v_pitch) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= width || y + 1 >= height) { + return; + } + + uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; + + YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; + YuvUnitx2 l1 = *(YuvUnitx2 *)(p_src + yuv_pitch); + YuvUnitx2 ch = *(YuvUnitx2 *)(p_src + (v_pitch - y / 2) * yuv_pitch); + + Rgb rgb0 = YuvToRgbForPixel(l0.x, ch.x, ch.y), + rgb1 = YuvToRgbForPixel(l0.y, ch.x, ch.y), + rgb2 = YuvToRgbForPixel(l1.x, ch.x, ch.y), + rgb3 = YuvToRgbForPixel(l1.y, ch.x, ch.y); + + uint8_t *p_dst = dp_rgbp + x * sizeof(RgbUnitx2) / 2 + y * nRgbpPitch; + *(RgbUnitx2 *)p_dst = RgbUnitx2 {rgb0.v.x, rgb1.v.x}; + *(RgbUnitx2 *)(p_dst + nRgbpPitch) = RgbUnitx2 {rgb2.v.x, rgb3.v.x}; + p_dst += nRgbpPitch * height; + *(RgbUnitx2 *)p_dst = RgbUnitx2 {rgb0.v.y, rgb1.v.y}; + *(RgbUnitx2 *)(p_dst + nRgbpPitch) = RgbUnitx2 {rgb2.v.y, rgb3.v.y}; + p_dst += nRgbpPitch * height; + *(RgbUnitx2 *)p_dst = RgbUnitx2 {rgb0.v.z, rgb1.v.z}; + *(RgbUnitx2 *)(p_dst + nRgbpPitch) = RgbUnitx2 {rgb2.v.z, rgb3.v.z}; +} + +template +__global__ static void Yuv444ToRgbPlanarKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgbp, int nRgbpPitch, int width, int height, int v_pitch) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= width || y >= height) { + return; + } + + uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; + + YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; + YuvUnitx2 ch1 = *(YuvUnitx2 *)(p_src + (v_pitch * yuv_pitch)); + YuvUnitx2 ch2 = *(YuvUnitx2 *)(p_src + (2 * v_pitch * yuv_pitch)); + + Rgb rgb0 = YuvToRgbForPixel(l0.x, ch1.x, ch2.x), + rgb1 = YuvToRgbForPixel(l0.y, ch1.y, ch2.y); + + + uint8_t *p_dst = dp_rgbp + x * sizeof(RgbUnitx2) / 2 + y * nRgbpPitch; + *(RgbUnitx2 *)p_dst = RgbUnitx2{ rgb0.v.x, rgb1.v.x }; + + p_dst += nRgbpPitch * height; + *(RgbUnitx2 *)p_dst = RgbUnitx2{ rgb0.v.y, rgb1.v.y }; + + p_dst += nRgbpPitch * height; + *(RgbUnitx2 *)p_dst = RgbUnitx2{ rgb0.v.z, rgb1.v.z }; +} + +template +void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbaKernel + <<>> + (dp_nv12, nv12_pitch, dp_bgra, bgra_pitch, width, height, v_pitch); +} + +template +void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbaKernel + <<>> + (dp_nv12, nv12_pitch, dp_bgra, bgra_pitch, width, height, v_pitch); +} + +template +void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbaKernel + <<>> + (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); +} + +template +void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbaKernel + <<>> + (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); +} + +template +void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbaKernel + <<>> + (dp_p016, p016_pitch, dp_bgra, bgra_pitch, width, height, v_pitch); +} + +template +void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbaKernel + <<>> + (dp_p016, p016_pitch, dp_bgra, bgra_pitch, width, height, v_pitch); +} + +template +void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbaKernel + <<>> + (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); +} + +template +void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbaKernel + <<>> + (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); +} + +template +void Nv12ToColorPlanar(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbPlanarKernel + <<>> + (dp_nv12, nv12_pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch); +} + +template +void P016ToColorPlanar(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbPlanarKernel + <<>> + (dp_p016, p016_pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch); +} + +template +void YUV444ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbPlanarKernel + <<>> + (dp_yuv_444, pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch); +} + +template +void YUV444P16ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbPlanarKernel + <<>> + (dp_yuv_444, pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch); +} + +// Explicit Instantiation: for RGB32/BGR32 and RGB64/BGR64 formats +template void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template void Nv12ToColorPlanar(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard); +template void Nv12ToColorPlanar(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColorPlanar(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColorPlanar(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard); + +template +void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbKernel + <<>> + (dp_nv12, nv12_pitch, dp_bgr, bgr_pitch, width, height, v_pitch); +} + +template +void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbKernel + <<>> + (dp_nv12, nv12_pitch, dp_bgr, bgr_pitch, width, height, v_pitch); +} + +template +void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbKernel + <<>> + (dp_yuv_444, pitch, dp_bgr, bgr_pitch, width, height, v_pitch); +} + +template +void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbKernel + <<>> + (dp_yuv_444, pitch, dp_bgr, bgr_pitch, width, height, v_pitch); +} + +template +void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbKernel + <<>> + (dp_p016, p016_pitch, dp_bgr, bgr_pitch, width, height, v_pitch); +} + +template +void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + YuvToRgbKernel + <<>> + (dp_p016, p016_pitch, dp_bgr, bgr_pitch, width, height, v_pitch); +} + +template +void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbKernel + <<>> + (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); +} + +template +void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) { + SetMatYuv2Rgb(col_standard); + Yuv444ToRgbKernel + <<>> + (dp_yuv_444, pitch, dp_bgr, bgr_pitch, width, height, v_pitch); +} + + +// Explicit Instantiation: for RGB24/BGR24 and RGB48/BGR48 formats +template void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); +template void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard); + + +template +__device__ inline YuvUnit RgbToY(RgbUnit r, RgbUnit g, RgbUnit b) { + const YuvUnit low = 1 << (sizeof(YuvUnit) * 8 - 4); + return rgb_to_yuv_mat[0][0] * r + rgb_to_yuv_mat[0][1] * g + rgb_to_yuv_mat[0][2] * b + low; +} + +template +__device__ inline YuvUnit RgbToU(RgbUnit r, RgbUnit g, RgbUnit b) { + const YuvUnit mid = 1 << (sizeof(YuvUnit) * 8 - 1); + return rgb_to_yuv_mat[1][0] * r + rgb_to_yuv_mat[1][1] * g + rgb_to_yuv_mat[1][2] * b + mid; +} + +template +__device__ inline YuvUnit RgbToV(RgbUnit r, RgbUnit g, RgbUnit b) { + const YuvUnit mid = 1 << (sizeof(YuvUnit) * 8 - 1); + return rgb_to_yuv_mat[2][0] * r + rgb_to_yuv_mat[2][1] * g + rgb_to_yuv_mat[2][2] * b + mid; +} + +template +__global__ static void RgbaToYuvKernel(uint8_t *dp_rgb, int rgba_pitch, uint8_t *dp_yuv, int yuv_pitch, int width, int height) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= width || y + 1 >= height) { + return; + } + + uint8_t *p_src = dp_rgb + x * sizeof(Rgb) + y * rgba_pitch; + RgbIntx2 int2a = *(RgbIntx2 *)p_src; + RgbIntx2 int2b = *(RgbIntx2 *)(p_src + rgba_pitch); + + Rgb rgb[4] = {int2a.x, int2a.y, int2b.x, int2b.y}; + decltype(Rgb::c.r) + r = (rgb[0].c.r + rgb[1].c.r + rgb[2].c.r + rgb[3].c.r) / 4, + g = (rgb[0].c.g + rgb[1].c.g + rgb[2].c.g + rgb[3].c.g) / 4, + b = (rgb[0].c.b + rgb[1].c.b + rgb[2].c.b + rgb[3].c.b) / 4; + + uint8_t *p_dst = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; + *(YuvUnitx2 *)p_dst = YuvUnitx2 { + RgbToY(rgb[0].c.r, rgb[0].c.g, rgb[0].c.b), + RgbToY(rgb[1].c.r, rgb[1].c.g, rgb[1].c.b), + }; + *(YuvUnitx2 *)(p_dst + yuv_pitch) = YuvUnitx2 { + RgbToY(rgb[2].c.r, rgb[2].c.g, rgb[2].c.b), + RgbToY(rgb[3].c.r, rgb[3].c.g, rgb[3].c.b), + }; + *(YuvUnitx2 *)(p_dst + (height - y / 2) * yuv_pitch) = YuvUnitx2 { + RgbToU(r, g, b), + RgbToV(r, g, b), + }; +} + +void Bgra64ToP016(uint8_t *dp_bgra, int bgra_pitch, uint8_t *dp_p016, int p016_pitch, int width, int height, int col_standard) { + SetMatRgb2Yuv(col_standard); + RgbaToYuvKernel + <<>> + (dp_bgra, bgra_pitch, dp_p016, p016_pitch, width, height); +} + +template +__global__ static void RgbToYuvKernel(uint8_t *dp_rgb, int rgb_pitch, uint8_t *dp_yuv, int yuv_pitch, int width, int height) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= width || y + 1 >= height) { + return; + } + + uint8_t *p_src = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; + RgbInt1 int1a = *(RgbInt1 *)p_src; + RgbInt2 int2a = *(RgbInt2 *)(p_src + sizeof(RgbInt1)); + RgbInt1 int1b = *(RgbInt1 *)(p_src + rgb_pitch); + RgbInt2 int2b = *(RgbInt2 *)(p_src + rgb_pitch + sizeof(RgbInt1)); + + Rgb rgb[4]; + rgb[0].v = {int1a.x, int1a.y, int1a.z}, + rgb[1].v = {int1a.w, int2a.x, int2a.y}, + rgb[2].v = {int1b.x, int1b.y, int1b.z}, + rgb[3].v = {int1b.w, int2b.x, int2b.y}; + decltype(Rgb::c.r) + r = (rgb[0].c.r + rgb[1].c.r + rgb[2].c.r + rgb[3].c.r) / 4, + g = (rgb[0].c.g + rgb[1].c.g + rgb[2].c.g + rgb[3].c.g) / 4, + b = (rgb[0].c.b + rgb[1].c.b + rgb[2].c.b + rgb[3].c.b) / 4; + + uint8_t *p_dst = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; + *(YuvUnitx2 *)p_dst = YuvUnitx2 { + RgbToY(rgb[0].c.r, rgb[0].c.g, rgb[0].c.b), + RgbToY(rgb[1].c.r, rgb[1].c.g, rgb[1].c.b), + }; + *(YuvUnitx2 *)(p_dst + yuv_pitch) = YuvUnitx2 { + RgbToY(rgb[2].c.r, rgb[2].c.g, rgb[2].c.b), + RgbToY(rgb[3].c.r, rgb[3].c.g, rgb[3].c.b), + }; + *(YuvUnitx2 *)(p_dst + (height - y / 2) * yuv_pitch) = YuvUnitx2 { + RgbToU(r, g, b), + RgbToV(r, g, b), + }; +} + +void Bgr48ToP016(uint8_t *p_bgr, int bgr_pitch, uint8_t *dp_p016, int p016_pitch, int width, int height, int col_standard) { + SetMatRgb2Yuv(col_standard); + RgbToYuvKernel + <<>> + (p_bgr, bgr_pitch, dp_p016, p016_pitch, width, height); +} diff --git a/utils/colorspace_kernels.h b/utils/colorspace_kernels.h new file mode 100644 index 0000000000..c600de8582 --- /dev/null +++ b/utils/colorspace_kernels.h @@ -0,0 +1,143 @@ + +/* +Copyright (c) 2023 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once +#include +#include + +/*! + * \file + * \brief The AMD Color Space Standards for VCN Decode Library. + * + * \defgroup group_amd_vcn_colorspace colorSpace: AMD VCN Color Space API + * \brief AMD The vcnDECODE Color Space API. + */ + +typedef enum ColorSpaceStandard_ { + ColorSpaceStandard_BT709 = 1, + ColorSpaceStandard_Unspecified = 2, + ColorSpaceStandard_Reserved = 3, + ColorSpaceStandard_FCC = 4, + ColorSpaceStandard_BT470 = 5, + ColorSpaceStandard_BT601 = 6, + ColorSpaceStandard_SMPTE240M = 7, + ColorSpaceStandard_YCgCo = 8, + ColorSpaceStandard_BT2020 = 9, + ColorSpaceStandard_BT2020C = 10 +} ColorSpaceStandard; + +union BGR24 { + uchar3 v; + struct { + uint8_t b, g, r; + } c; +}; + +union RGB24 { + uchar3 v; + struct { + uint8_t r, g, b; + } c; +}; + +union BGR48 { + ushort3 v; + struct { + uint16_t b, g, r; + } c; +}; + +union RGB48 { + ushort3 v; + struct { + uint16_t r, g, b; + } c; +}; + +union BGRA32 { + uint32_t d; + uchar4 v; + struct { + uint8_t b, g, r, a; + } c; +}; + +union RGBA32 { + uint32_t d; + uchar4 v; + struct { + uint8_t r, g, b, a; + } c; +}; + +union BGRA64 { + uint64_t d; + ushort4 v; + struct { + uint16_t b, g, r, a; + } c; +}; + +union RGBA64 { + uint64_t d; + ushort4 v; + struct { + uint16_t r, g, b, a; + } c; +}; + +// color-convert hip kernel function definitions +template +void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template +void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template +void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard); +template +void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard); + +template +void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard); +template +void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template +void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard); +template +void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template +void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template +void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard); +template +void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template +void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template +void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template +void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard); +template +void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard); +template +void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard); + diff --git a/utils/rocvideodecode/commons.h b/utils/rocvideodecode/commons.h deleted file mode 100644 index 46aca6ba8f..0000000000 --- a/utils/rocvideodecode/commons.h +++ /dev/null @@ -1,51 +0,0 @@ -/* -Copyright (c) 2023 - 2023 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#pragma once -#include -#include -#include -#include - -#define TOSTR(X) std::to_string(static_cast(X)) -#define STR(X) std::string(X) - -#if DBGINFO -#define INFO(X) std::clog << "[INF] " << " {" << __func__ <<"} " << " " << X << std::endl; -#else -#define INFO(X) ; -#endif -#define ERR(X) std::cerr << "[ERR] " << " {" << __func__ <<"} " << " " << X << std::endl; - - -class rocVideoDecodeException : public std::exception { -public: - - explicit rocVideoDecodeException(const std::string& message):_message(message){} - virtual const char* what() const throw() override { - return _message.c_str(); - } -private: - std::string _message; -}; - -#define THROW(X) throw rocVideoDecodeException(" { "+std::string(__func__)+" } " + X); diff --git a/utils/rocvideodecode/roc_video_dec.h b/utils/rocvideodecode/roc_video_dec.h index 00a16058b5..ecb9353273 100644 --- a/utils/rocvideodecode/roc_video_dec.h +++ b/utils/rocvideodecode/roc_video_dec.h @@ -158,6 +158,9 @@ class RocVideoDecoder { ~RocVideoDecoder(); rocDecVideoCodec GetCodecId() { return codec_id_; } + + hipStream_t GetStream() {return hip_stream_;} + /** * @brief Get the output frame width */