diff --git a/samples/videoDecodeRGB/CMakeLists.txt b/samples/videoDecodeRGB/CMakeLists.txt
new file mode 100644
index 0000000000..a30908cb7b
--- /dev/null
+++ b/samples/videoDecodeRGB/CMakeLists.txt
@@ -0,0 +1,82 @@
+################################################################################
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+################################################################################
+
+cmake_minimum_required (VERSION 3.5)
+project(videodecodergb)
+set(CMAKE_CXX_STANDARD 17)
+
+# ROCM Path
+if(DEFINED ENV{ROCM_PATH})
+ set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Default ROCm installation path")
+elseif(ROCM_PATH)
+ message("-- INFO:ROCM_PATH Set -- ${ROCM_PATH}")
+else()
+ set(ROCM_PATH /opt/rocm CACHE PATH "Default ROCm installation path")
+endif()
+
+list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../../cmake)
+list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH})
+set(CMAKE_CXX_COMPILER ${ROCM_PATH}/llvm/bin/clang++)
+
+set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx940;gfx1030;gfx1031;gfx1032;gfx1100")
+set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target")
+find_package(HIP QUIET)
+find_package(FFmpeg QUIET)
+# find rocDecode
+find_library(ROCDECODE_LIBRARY NAMES rocdecode HINTS {ROCM_PATH}/lib)
+find_path(ROCDECODE_INCLUDE_DIR NAMES rocdecode.h PATHS /opt/rocm/include/rocdecode {ROCM_PATH}/include/rocdecode)
+
+if(ROCDECODE_LIBRARY AND ROCDECODE_INCLUDE_DIR)
+ set(ROCDECODE_FOUND TRUE)
+ message("-- ${White}Using rocDecode -- \n\tLibraries:${ROCDECODE_LIBRARY} \n\tIncludes:${ROCDECODE_INCLUDE_DIR}${ColourReset}")
+endif()
+
+if(HIP_FOUND AND FFMPEG_FOUND AND ROCDECODE_FOUND)
+ # HIP
+ set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} hip::device)
+ # FFMPEG
+ include_directories(${AVUTIL_INCLUDE_DIR} ${AVCODEC_INCLUDE_DIR})
+ set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} ${FFMPEG_LIBRARIES})
+ # rocDecode and utils
+ include_directories (${CMAKE_CURRENT_SOURCE_DIR}/../../utils ${ROCDECODE_INCLUDE_DIR})
+ set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} ${ROCDECODE_LIBRARY})
+ list(APPEND SOURCES ${PROJECT_SOURCE_DIR}
+ videodecrgb.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/rocvideodecode/roc_video_dec.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/colorspace_kernels.cpp)
+
+ add_executable(${PROJECT_NAME} ${SOURCES})
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17")
+ target_link_libraries(${PROJECT_NAME} ${LINK_LIBRARY_LIST})
+else()
+ message("-- ERROR!: ${PROJECT_NAME} excluded! please install all the dependencies and try again!")
+ if (NOT HIP_FOUND)
+ message(FATAL_ERROR "-- ERROR!: HIP Not Found! - please install ROCm and HIP!")
+ endif()
+ if (NOT FFMPEG_FOUND)
+ message(FATAL_ERROR "-- ERROR!: FFMPEG Not Found! - please install FFMPEG!")
+ endif()
+ if (NOT ROCDECODE_FOUND)
+ message(FATAL_ERROR "-- ERROR!: rocDecode Not Found! - please install rocDecode!")
+ endif()
+endif()
diff --git a/samples/videoDecodeRGB/README.md b/samples/videoDecodeRGB/README.md
new file mode 100644
index 0000000000..db2efb433e
--- /dev/null
+++ b/samples/videoDecodeRGB/README.md
@@ -0,0 +1,11 @@
+# Video Decode Sample
+This sample illustrates the FFMPEG demuxer to get the individual frames which are then decoded using rocDecode API and optionally color-converted using custom HIP kernels on AMD hardware. This sample converts decoded YUV output to one of the RGB or BGR formats(24bit, 32bit, 464bit)
+
+## Build and run the sample:
+```
+mkdir build
+cd build
+cmake ..
+make -j
+./videodecodergb -i -o -d -of
+```
\ No newline at end of file
diff --git a/samples/videoDecodeRGB/videodecrgb.cpp b/samples/videoDecodeRGB/videodecrgb.cpp
new file mode 100644
index 0000000000..41a1cef2a5
--- /dev/null
+++ b/samples/videoDecodeRGB/videodecrgb.cpp
@@ -0,0 +1,325 @@
+/*
+Copyright (c) 2023 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#if __cplusplus >= 201703L && __has_include()
+ #include
+#else
+ #include
+#endif
+#include "video_demuxer.h"
+#include "roc_video_dec.h"
+#include "colorspace_kernels.h"
+
+FILE *fpOut = nullptr;
+enum OutputFormatEnum {
+ native = 0, bgr, bgr48, rgb, rgb48, bgra, bgra64, rgba, rgba64
+};
+std::vector st_output_format_name = {"native", "bgr", "bgr48", "rgb", "rgb48", "bgra", "bgra64", "rgba", "rgba64"};
+
+void ShowHelpAndExit(const char *option = NULL) {
+ std::cout << "Options:" << std::endl
+ << "-i Input File Path - required" << std::endl
+ << "-o Output File Path - dumps output if requested; optional" << std::endl
+ << "-d GPU device ID (0 for the first device, 1 for the second, etc.); optional; default: 0" << std::endl
+ << "-of Output Format name - (native, bgr, bgr48, rgb, rgb48, bgra, bgra64, rgba, rgba64; converts native YUV frame to RGB image format; optional; default: 0" << std::endl
+ << "-crop crop rectangle for output (not used when using interopped decoded frame); optional; default: 0" << std::endl
+ << "-m output_surface_memory_type - decoded surface memory; optional; default - 0" << std::endl;
+
+ exit(0);
+}
+
+void DumpRGBImage(std::string outputfileName, void* pdevMem, OutputSurfaceInfo *surf_info, int rgb_image_size) {
+ if (fpOut == nullptr) {
+ fpOut = fopen(outputfileName.c_str(), "wb");
+ }
+ uint8_t *hstPtr = nullptr;
+ hstPtr = new uint8_t [rgb_image_size];
+ hipError_t hip_status = hipSuccess;
+ hip_status = hipMemcpyDtoH((void *)hstPtr, pdevMem, rgb_image_size);
+ if (hip_status != hipSuccess) {
+ std::cout << "ERROR: hipMemcpyDtoH failed! (" << hip_status << ")" << std::endl;
+ delete [] hstPtr;
+ return;
+ }
+ if (fpOut) {
+ fwrite(hstPtr, 1, rgb_image_size, fpOut);
+ }
+
+ if (hstPtr != nullptr) {
+ delete [] hstPtr;
+ hstPtr = nullptr;
+ }
+}
+
+void ColorConvertYUV2RGB(uint8_t *p_src, OutputSurfaceInfo *surf_info, uint8_t *rgb_dev_mem_ptr, OutputFormatEnum e_output_format) {
+
+ int rgb_width = (surf_info->output_width + 1) & ~1; // has to be a multiple of 2 for hip colorconvert kernels
+ // todo:: get color standard from the decoder
+ if (surf_info->surface_format == rocDecVideoSurfaceFormat_YUV444) {
+ if (e_output_format == bgr)
+ YUV444ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == bgra)
+ YUV444ToColor32(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 4 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgb)
+ YUV444ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgba)
+ YUV444ToColor32(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 4 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ } else if (surf_info->surface_format == rocDecVideoSurfaceFormat_NV12) {
+ if (e_output_format == bgr)
+ Nv12ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == bgra)
+ Nv12ToColor32(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 4 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgb)
+ Nv12ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgba)
+ Nv12ToColor32(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 4 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ }
+ if (surf_info->surface_format == rocDecVideoSurfaceFormat_YUV444_16Bit) {
+ if (e_output_format == bgr)
+ YUV444P16ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgb)
+ YUV444P16ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == bgr48)
+ YUV444P16ToColor48(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 6 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgb48)
+ YUV444P16ToColor48(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 6 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == bgra64)
+ YUV444P16ToColor64(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 8 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgba64)
+ YUV444P16ToColor64(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 8 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ } else if (surf_info->surface_format == rocDecVideoSurfaceFormat_P016) {
+ if (e_output_format == bgr)
+ P016ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgb)
+ P016ToColor24(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 3 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == bgr48)
+ P016ToColor48(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 6 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgb48)
+ P016ToColor48(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 6 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == bgra64)
+ P016ToColor64(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 8 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ else if (e_output_format == rgba64)
+ P016ToColor64(p_src, surf_info->output_pitch, static_cast(rgb_dev_mem_ptr), 8 * rgb_width, surf_info->output_width,
+ surf_info->output_height, surf_info->output_vstride, 0);
+ }
+}
+
+int main(int argc, char **argv) {
+
+ std::string input_file_path, output_file_path;
+ int dump_output_frames = 0;
+ int convert_to_rgb = 0;
+ int device_id = 0;
+ Rect crop_rect = {};
+ Rect *p_crop_rect = nullptr;
+ size_t rgb_image_size;
+ uint32_t rgb_image_stride;
+ hipError_t hip_status = hipSuccess;
+ uint8_t *p_rgb_dev_mem= nullptr;
+ OutputSurfaceMemoryType mem_type = OUT_SURFACE_MEM_DEV_INTERNAL; // set to internal
+ OutputFormatEnum e_output_format = native;
+ int rgb_width;
+
+ // Parse command-line arguments
+ if(argc < 1) {
+ ShowHelpAndExit();
+ }
+ for (int i = 1; i < argc; i++) {
+ if (!strcmp(argv[i], "-h")) {
+ ShowHelpAndExit();
+ }
+ if (!strcmp(argv[i], "-i")) {
+ if (++i == argc) {
+ ShowHelpAndExit("-i");
+ }
+ input_file_path = argv[i];
+ continue;
+ }
+ if (!strcmp(argv[i], "-o")) {
+ if (++i == argc) {
+ ShowHelpAndExit("-o");
+ }
+ output_file_path = argv[i];
+ dump_output_frames = 1;
+ continue;
+ }
+ if (!strcmp(argv[i], "-d")) {
+ if (++i == argc) {
+ ShowHelpAndExit("-d");
+ }
+ device_id = atoi(argv[i]);
+ continue;
+ }
+ if (!strcmp(argv[i], "-m")) {
+ if (++i == argc) {
+ ShowHelpAndExit("-m");
+ }
+ mem_type = static_cast(atoi(argv[i]));
+ continue;
+ }
+ if (!strcmp(argv[i], "-crop")) {
+ if (++i == argc || 4 != sscanf(argv[i], "%d,%d,%d,%d", &crop_rect.l, &crop_rect.t, &crop_rect.r, &crop_rect.b)) {
+ ShowHelpAndExit("-crop");
+ }
+ if ((crop_rect.r - crop_rect.l) % 2 == 1 || (crop_rect.b - crop_rect.t) % 2 == 1) {
+ std::cout << "output crop rectangle must have width and height of even numbers" << std::endl;
+ exit(1);
+ }
+ p_crop_rect = &crop_rect;
+ continue;
+ }
+ if (!strcmp(argv[i], "-of")) {
+ if (++i == argc) {
+ ShowHelpAndExit("-of");
+ }
+ auto it = find(st_output_format_name.begin(), st_output_format_name.end(), argv[i]);
+ if (it == st_output_format_name.end()) {
+ ShowHelpAndExit("-of");
+ }
+ e_output_format = (OutputFormatEnum)(it-st_output_format_name.begin());
+ continue;
+ }
+ ShowHelpAndExit(argv[i]);
+ }
+
+ try {
+ VideoDemuxer demuxer(input_file_path.c_str());
+ rocDecVideoCodec rocdec_codec_id = AVCodec2RocDecVideoCodec(demuxer.GetCodecID());
+ RocVideoDecoder viddec(device_id, mem_type, rocdec_codec_id, false, p_crop_rect);
+
+ std::string device_name, gcn_arch_name;
+ int pci_bus_id, pci_domain_id, pci_device_id;
+ hipStream_t stream = viddec.GetStream();
+
+ viddec.GetDeviceinfo(device_name, gcn_arch_name, pci_bus_id, pci_domain_id, pci_device_id);
+ std::cout << "info: Using GPU device " << device_id << " " << device_name << "[" << gcn_arch_name << "] on PCI bus " <<
+ std::setfill('0') << std::setw(2) << std::right << std::hex << pci_bus_id << ":" << std::setfill('0') << std::setw(2) <<
+ std::right << std::hex << pci_domain_id << "." << pci_device_id << std::dec << std::endl;
+ std::cout << "info: decoding started, please wait!" << std::endl;
+
+ int n_video_bytes = 0, n_frames_returned = 0, n_frame = 0;
+ uint8_t *p_video = nullptr;
+ uint8_t *p_frame = nullptr;
+ int64_t pts = 0;
+ OutputSurfaceInfo *surf_info;
+ uint32_t width, height;
+ double total_dec_time = 0;
+ convert_to_rgb = e_output_format != native;
+
+ do {
+ auto startTime = std::chrono::high_resolution_clock::now();
+ demuxer.Demux(&p_video, &n_video_bytes, &pts);
+ n_frames_returned = viddec.DecodeFrame(p_video, n_video_bytes, 0, pts);
+ auto end_time = std::chrono::high_resolution_clock::now();
+ auto time_per_frame = std::chrono::duration(end_time - startTime).count();
+ total_dec_time += time_per_frame;
+ if (!n_frame && !viddec.GetOutputSurfaceInfo(&surf_info)){
+ std::cerr << "Error: Failed to get Output Image Info!" << std::endl;
+ break;
+ }
+
+ for (int i = 0; i < n_frames_returned; i++) {
+ p_frame = viddec.GetFrame(&pts);
+ if (convert_to_rgb) {
+ if (surf_info->bit_depth == 8) {
+ rgb_width = (surf_info->output_width + 1) & ~1; // has to be a multiple of 2 for hip colorconvert kernels
+ rgb_image_size = ((e_output_format == bgr) || (e_output_format == rgb)) ? rgb_width * surf_info->output_height * 3 : rgb_width * surf_info->output_height * 4;
+ } else { // 16bit
+ rgb_width = (surf_info->output_width + 1) & ~1; // has to be a multiple of 2 for hip colorconvert kernels
+ rgb_image_size = ((e_output_format == bgr) || (e_output_format == rgb)) ? rgb_width * surf_info->output_height * 3 : ((e_output_format == bgr48) || (e_output_format == rgb48)) ?
+ rgb_width * surf_info->output_height * 6 : rgb_width * surf_info->output_height * 8;
+ }
+ if (p_rgb_dev_mem == nullptr) {
+ hip_status = hipMalloc(&p_rgb_dev_mem, rgb_image_size);
+ if (hip_status != hipSuccess) {
+ std::cerr << "ERROR: hipMalloc failed to allocate the device memory for the output!" << hip_status << std::endl;
+ return -1;
+ }
+ }
+ ColorConvertYUV2RGB(p_frame, surf_info, p_rgb_dev_mem, e_output_format);
+ }
+ if (dump_output_frames) {
+ if (convert_to_rgb)
+ DumpRGBImage(output_file_path, p_rgb_dev_mem, surf_info, rgb_image_size);
+ else
+ viddec.SaveFrameToFile(output_file_path, p_frame, surf_info);
+ }
+ // release frame
+ viddec.ReleaseFrame(pts);
+ }
+ n_frame += n_frames_returned;
+ } while (n_video_bytes);
+
+ if (p_rgb_dev_mem != nullptr) {
+ hip_status = hipFree(p_rgb_dev_mem);
+ if (hip_status != hipSuccess) {
+ std::cout << "ERROR: hipFree failed! (" << hip_status << ")" << std::endl;
+ return -1;
+ }
+ }
+ if (fpOut) {
+ fclose(fpOut);
+ fpOut = nullptr;
+ }
+
+ std::cout << "info: Video codec format: " << viddec.GetCodecFmtName(viddec.GetCodecId()) << std::endl;
+ std::cout << "info: Video size: [ " << surf_info->output_width << ", " << surf_info->output_height << " ]" << std::endl;
+ std::cout << "info: Video surface format: " << viddec.GetSurfaceFmtName(surf_info->surface_format) << std::endl;
+ std::cout << "info: Video Bit depth: " << surf_info->bit_depth << std::endl;
+ std::cout << "info: Total frame decoded: " << n_frame << std::endl;
+ if (!dump_output_frames) {
+ std::cout << "info: avg decoding time per frame (ms): " << total_dec_time / n_frame << std::endl;
+ std::cout << "info: avg FPS: " << (n_frame / total_dec_time) * 1000 << std::endl;
+ }
+ } catch (const std::exception &ex) {
+ std::cout << ex.what() << std::endl;
+ exit(1);
+ }
+
+ return 0;
+}
diff --git a/utils/colorspace_kernels.cpp b/utils/colorspace_kernels.cpp
new file mode 100644
index 0000000000..49e35740fa
--- /dev/null
+++ b/utils/colorspace_kernels.cpp
@@ -0,0 +1,593 @@
+/*
+Copyright (c) 2023 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "colorspace_kernels.h"
+#include "roc_video_dec.h"
+
+__constant__ float yuv_to_rgb_mat[3][3];
+__constant__ float rgb_to_yuv_mat[3][3];
+
+
+void inline GetColMatCoefficients(int col_standard, float &wr, float &wb, int &black, int &white, int &max) {
+ black = 16; white = 235;
+ max = 255;
+
+ switch (col_standard)
+ {
+ case ColorSpaceStandard_BT709:
+ default:
+ wr = 0.2126f; wb = 0.0722f;
+ break;
+
+ case ColorSpaceStandard_FCC:
+ wr = 0.30f; wb = 0.11f;
+ break;
+
+ case ColorSpaceStandard_BT470:
+ case ColorSpaceStandard_BT601:
+ wr = 0.2990f; wb = 0.1140f;
+ break;
+
+ case ColorSpaceStandard_SMPTE240M:
+ wr = 0.212f; wb = 0.087f;
+ break;
+
+ case ColorSpaceStandard_BT2020:
+ case ColorSpaceStandard_BT2020C:
+ wr = 0.2627f; wb = 0.0593f;
+ // 10-bit only
+ black = 64 << 6; white = 940 << 6;
+ max = (1 << 16) - 1;
+ break;
+ }
+}
+
+void SetMatYuv2Rgb(int col_standard) {
+ float wr, wb;
+ int black, white, max;
+ GetColMatCoefficients(col_standard, wr, wb, black, white, max);
+ float mat[3][3] = {
+ 1.0f, 0.0f, (1.0f - wr) / 0.5f,
+ 1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr),
+ 1.0f, (1.0f - wb) / 0.5f, 0.0f,
+ };
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++) {
+ mat[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]);
+ }
+ }
+ HIP_API_CALL(hipMemcpyToSymbol(yuv_to_rgb_mat, mat, sizeof(mat)));
+}
+
+void SetMatRgb2Yuv(int col_standard) {
+ float wr, wb;
+ int black, white, max;
+ GetColMatCoefficients(col_standard, wr, wb, black, white, max);
+ float mat[3][3] = {
+ wr, 1.0f - wb - wr, wb,
+ -0.5f * wr / (1.0f - wb), -0.5f * (1 - wb - wr) / (1.0f - wb), 0.5f,
+ 0.5f, -0.5f * (1.0f - wb - wr) / (1.0f - wr), -0.5f * wb / (1.0f - wr),
+ };
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++) {
+ mat[i][j] = (float)(1.0 * (white - black) / max * mat[i][j]);
+ }
+ }
+ HIP_API_CALL(hipMemcpyToSymbol(rgb_to_yuv_mat, mat, sizeof(mat)));
+}
+
+template
+__device__ static T Clamp(T x, T lower, T upper) {
+ return x < lower ? lower : (x > upper ? upper : x);
+}
+
+template
+__device__ inline Rgb YuvToRgbForPixel(YuvUnit y, YuvUnit u, YuvUnit v) {
+ const int
+ low = 1 << (sizeof(YuvUnit) * 8 - 4),
+ mid = 1 << (sizeof(YuvUnit) * 8 - 1);
+ float fy = (int)y - low, fu = (int)u - mid, fv = (int)v - mid;
+ const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f;
+ YuvUnit
+ r = (YuvUnit)Clamp(yuv_to_rgb_mat[0][0] * fy + yuv_to_rgb_mat[0][1] * fu + yuv_to_rgb_mat[0][2] * fv, 0.0f, maxf),
+ g = (YuvUnit)Clamp(yuv_to_rgb_mat[1][0] * fy + yuv_to_rgb_mat[1][1] * fu + yuv_to_rgb_mat[1][2] * fv, 0.0f, maxf),
+ b = (YuvUnit)Clamp(yuv_to_rgb_mat[2][0] * fy + yuv_to_rgb_mat[2][1] * fu + yuv_to_rgb_mat[2][2] * fv, 0.0f, maxf);
+
+ Rgb rgb{};
+ const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(rgb.c.r)) * 8;
+ if (sizeof(YuvUnit) >= sizeof(rgb.c.r)) {
+ rgb.c.r = r >> nShift;
+ rgb.c.g = g >> nShift;
+ rgb.c.b = b >> nShift;
+ } else {
+ rgb.c.r = r << nShift;
+ rgb.c.g = g << nShift;
+ rgb.c.b = b << nShift;
+ }
+ return rgb;
+}
+
+// yuv to RGBA (32/64 bit)
+template
+__global__ static void YuvToRgbaKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) {
+ int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
+ int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
+ if (x + 1 >= width || y + 1 >= height) {
+ return;
+ }
+
+ uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch;
+ uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch;
+
+ YuvUnitx2 l0 = *(YuvUnitx2 *)p_src;
+ YuvUnitx2 l1 = *(YuvUnitx2 *)(p_src + yuv_pitch);
+ YuvUnitx2 ch = *(YuvUnitx2 *)(p_src + (v_pitch - y / 2) * yuv_pitch);
+
+ *(RgbIntx2 *)p_dst = RgbIntx2 {
+ YuvToRgbForPixel(l0.x, ch.x, ch.y).d,
+ YuvToRgbForPixel(l0.y, ch.x, ch.y).d,
+ };
+ *(RgbIntx2 *)(p_dst + rgb_pitch) = RgbIntx2 {
+ YuvToRgbForPixel(l1.x, ch.x, ch.y).d,
+ YuvToRgbForPixel(l1.y, ch.x, ch.y).d,
+ };
+}
+
+// yuv to RGB (24/48 bit)
+template
+__global__ static void YuvToRgbKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) {
+ int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
+ int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
+ if (x + 1 >= width || y + 1 >= height) {
+ return;
+ }
+
+ uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch;
+ uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch;
+ uint8_t *p_dst1 = p_dst + rgb_pitch;
+
+ YuvUnitx2 l0 = *(YuvUnitx2 *)p_src;
+ YuvUnitx2 l1 = *(YuvUnitx2 *)(p_src + yuv_pitch);
+ YuvUnitx2 ch = *(YuvUnitx2 *)(p_src + (v_pitch - y / 2) * yuv_pitch);
+ Rgb rgb0 = YuvToRgbForPixel(l0.x, ch.x, ch.y),
+ rgb1 = YuvToRgbForPixel(l0.y, ch.x, ch.y),
+ rgb2 = YuvToRgbForPixel(l1.x, ch.x, ch.y),
+ rgb3 = YuvToRgbForPixel(l1.y, ch.x, ch.y);
+
+ *(RgbInt1 *)p_dst = RgbInt1 { rgb0.v.x, rgb0.v.y, rgb0.v.z, rgb1.v.x };
+ *(RgbInt2 *)(p_dst + sizeof(RgbInt1)) = RgbInt2 { rgb1.v.y, rgb1.v.z };
+ *(RgbInt1 *)(p_dst1) = RgbInt1 { rgb2.v.x, rgb2.v.y, rgb2.v.z, rgb3.v.x };
+ *(RgbInt2 *)(p_dst1 + sizeof(RgbInt1)) = RgbInt2 { rgb3.v.y, rgb3.v.z };
+}
+
+// yuv444 to RGBA (32/64 bit)
+template
+__global__ static void Yuv444ToRgbaKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) {
+ int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
+ int y = (threadIdx.y + blockIdx.y * blockDim.y);
+ if (x + 1 >= width || y >= height) {
+ return;
+ }
+
+ uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch;
+ uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch;
+
+ YuvUnitx2 l0 = *(YuvUnitx2 *)p_src;
+ YuvUnitx2 ch1 = *(YuvUnitx2 *)(p_src + (v_pitch * yuv_pitch));
+ YuvUnitx2 ch2 = *(YuvUnitx2 *)(p_src + (2 * v_pitch * yuv_pitch));
+
+ *(RgbIntx2 *)p_dst = RgbIntx2{
+ YuvToRgbForPixel(l0.x, ch1.x, ch2.x).d,
+ YuvToRgbForPixel(l0.y, ch1.y, ch2.y).d,
+ };
+}
+
+// yuv444 to RGB (24/48 bit)
+template
+__global__ static void Yuv444ToRgbKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) {
+ int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
+ int y = (threadIdx.y + blockIdx.y * blockDim.y);
+ if (x + 1 >= width || y >= height) {
+ return;
+ }
+
+ uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch;
+ uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch;
+
+ YuvUnitx2 l0 = *(YuvUnitx2 *)p_src;
+ YuvUnitx2 ch1 = *(YuvUnitx2 *)(p_src + (v_pitch * yuv_pitch));
+ YuvUnitx2 ch2 = *(YuvUnitx2 *)(p_src + (2 * v_pitch * yuv_pitch));
+ Rgb rgb0 = YuvToRgbForPixel(l0.x, ch1.x, ch2.x),
+ rgb1 = YuvToRgbForPixel(l0.y, ch1.y, ch2.y);
+
+ *(RgbInt1 *)p_dst = RgbInt1 { rgb0.v.x, rgb0.v.y, rgb0.v.z, rgb1.v.x };
+ *(RgbInt2 *)(p_dst + sizeof(RgbInt1)) = RgbInt2 { rgb1.v.y, rgb1.v.z };
+}
+
+
+template
+__global__ static void YuvToRgbPlanarKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgbp, int nRgbpPitch, int width, int height, int v_pitch) {
+ int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
+ int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
+ if (x + 1 >= width || y + 1 >= height) {
+ return;
+ }
+
+ uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch;
+
+ YuvUnitx2 l0 = *(YuvUnitx2 *)p_src;
+ YuvUnitx2 l1 = *(YuvUnitx2 *)(p_src + yuv_pitch);
+ YuvUnitx2 ch = *(YuvUnitx2 *)(p_src + (v_pitch - y / 2) * yuv_pitch);
+
+ Rgb rgb0 = YuvToRgbForPixel(l0.x, ch.x, ch.y),
+ rgb1 = YuvToRgbForPixel(l0.y, ch.x, ch.y),
+ rgb2 = YuvToRgbForPixel(l1.x, ch.x, ch.y),
+ rgb3 = YuvToRgbForPixel(l1.y, ch.x, ch.y);
+
+ uint8_t *p_dst = dp_rgbp + x * sizeof(RgbUnitx2) / 2 + y * nRgbpPitch;
+ *(RgbUnitx2 *)p_dst = RgbUnitx2 {rgb0.v.x, rgb1.v.x};
+ *(RgbUnitx2 *)(p_dst + nRgbpPitch) = RgbUnitx2 {rgb2.v.x, rgb3.v.x};
+ p_dst += nRgbpPitch * height;
+ *(RgbUnitx2 *)p_dst = RgbUnitx2 {rgb0.v.y, rgb1.v.y};
+ *(RgbUnitx2 *)(p_dst + nRgbpPitch) = RgbUnitx2 {rgb2.v.y, rgb3.v.y};
+ p_dst += nRgbpPitch * height;
+ *(RgbUnitx2 *)p_dst = RgbUnitx2 {rgb0.v.z, rgb1.v.z};
+ *(RgbUnitx2 *)(p_dst + nRgbpPitch) = RgbUnitx2 {rgb2.v.z, rgb3.v.z};
+}
+
+template
+__global__ static void Yuv444ToRgbPlanarKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgbp, int nRgbpPitch, int width, int height, int v_pitch) {
+ int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
+ int y = (threadIdx.y + blockIdx.y * blockDim.y);
+ if (x + 1 >= width || y >= height) {
+ return;
+ }
+
+ uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch;
+
+ YuvUnitx2 l0 = *(YuvUnitx2 *)p_src;
+ YuvUnitx2 ch1 = *(YuvUnitx2 *)(p_src + (v_pitch * yuv_pitch));
+ YuvUnitx2 ch2 = *(YuvUnitx2 *)(p_src + (2 * v_pitch * yuv_pitch));
+
+ Rgb rgb0 = YuvToRgbForPixel(l0.x, ch1.x, ch2.x),
+ rgb1 = YuvToRgbForPixel(l0.y, ch1.y, ch2.y);
+
+
+ uint8_t *p_dst = dp_rgbp + x * sizeof(RgbUnitx2) / 2 + y * nRgbpPitch;
+ *(RgbUnitx2 *)p_dst = RgbUnitx2{ rgb0.v.x, rgb1.v.x };
+
+ p_dst += nRgbpPitch * height;
+ *(RgbUnitx2 *)p_dst = RgbUnitx2{ rgb0.v.y, rgb1.v.y };
+
+ p_dst += nRgbpPitch * height;
+ *(RgbUnitx2 *)p_dst = RgbUnitx2{ rgb0.v.z, rgb1.v.z };
+}
+
+template
+void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbaKernel
+ <<>>
+ (dp_nv12, nv12_pitch, dp_bgra, bgra_pitch, width, height, v_pitch);
+}
+
+template
+void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbaKernel
+ <<>>
+ (dp_nv12, nv12_pitch, dp_bgra, bgra_pitch, width, height, v_pitch);
+}
+
+template
+void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbaKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch);
+}
+
+template
+void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbaKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch);
+}
+
+template
+void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbaKernel
+ <<>>
+ (dp_p016, p016_pitch, dp_bgra, bgra_pitch, width, height, v_pitch);
+}
+
+template
+void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbaKernel
+ <<>>
+ (dp_p016, p016_pitch, dp_bgra, bgra_pitch, width, height, v_pitch);
+}
+
+template
+void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbaKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch);
+}
+
+template
+void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbaKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch);
+}
+
+template
+void Nv12ToColorPlanar(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbPlanarKernel
+ <<>>
+ (dp_nv12, nv12_pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch);
+}
+
+template
+void P016ToColorPlanar(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbPlanarKernel
+ <<>>
+ (dp_p016, p016_pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch);
+}
+
+template
+void YUV444ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbPlanarKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch);
+}
+
+template
+void YUV444P16ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbPlanarKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch);
+}
+
+// Explicit Instantiation: for RGB32/BGR32 and RGB64/BGR64 formats
+template void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template void Nv12ToColorPlanar(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard);
+template void Nv12ToColorPlanar(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColorPlanar(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColorPlanar(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard);
+
+template
+void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbKernel
+ <<>>
+ (dp_nv12, nv12_pitch, dp_bgr, bgr_pitch, width, height, v_pitch);
+}
+
+template
+void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbKernel
+ <<>>
+ (dp_nv12, nv12_pitch, dp_bgr, bgr_pitch, width, height, v_pitch);
+}
+
+template
+void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgr, bgr_pitch, width, height, v_pitch);
+}
+
+template
+void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgr, bgr_pitch, width, height, v_pitch);
+}
+
+template
+void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbKernel
+ <<>>
+ (dp_p016, p016_pitch, dp_bgr, bgr_pitch, width, height, v_pitch);
+}
+
+template
+void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ YuvToRgbKernel
+ <<>>
+ (dp_p016, p016_pitch, dp_bgr, bgr_pitch, width, height, v_pitch);
+}
+
+template
+void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch);
+}
+
+template
+void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard) {
+ SetMatYuv2Rgb(col_standard);
+ Yuv444ToRgbKernel
+ <<>>
+ (dp_yuv_444, pitch, dp_bgr, bgr_pitch, width, height, v_pitch);
+}
+
+
+// Explicit Instantiation: for RGB24/BGR24 and RGB48/BGR48 formats
+template void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+template void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard);
+
+
+template
+__device__ inline YuvUnit RgbToY(RgbUnit r, RgbUnit g, RgbUnit b) {
+ const YuvUnit low = 1 << (sizeof(YuvUnit) * 8 - 4);
+ return rgb_to_yuv_mat[0][0] * r + rgb_to_yuv_mat[0][1] * g + rgb_to_yuv_mat[0][2] * b + low;
+}
+
+template
+__device__ inline YuvUnit RgbToU(RgbUnit r, RgbUnit g, RgbUnit b) {
+ const YuvUnit mid = 1 << (sizeof(YuvUnit) * 8 - 1);
+ return rgb_to_yuv_mat[1][0] * r + rgb_to_yuv_mat[1][1] * g + rgb_to_yuv_mat[1][2] * b + mid;
+}
+
+template
+__device__ inline YuvUnit RgbToV(RgbUnit r, RgbUnit g, RgbUnit b) {
+ const YuvUnit mid = 1 << (sizeof(YuvUnit) * 8 - 1);
+ return rgb_to_yuv_mat[2][0] * r + rgb_to_yuv_mat[2][1] * g + rgb_to_yuv_mat[2][2] * b + mid;
+}
+
+template
+__global__ static void RgbaToYuvKernel(uint8_t *dp_rgb, int rgba_pitch, uint8_t *dp_yuv, int yuv_pitch, int width, int height) {
+ int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
+ int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
+ if (x + 1 >= width || y + 1 >= height) {
+ return;
+ }
+
+ uint8_t *p_src = dp_rgb + x * sizeof(Rgb) + y * rgba_pitch;
+ RgbIntx2 int2a = *(RgbIntx2 *)p_src;
+ RgbIntx2 int2b = *(RgbIntx2 *)(p_src + rgba_pitch);
+
+ Rgb rgb[4] = {int2a.x, int2a.y, int2b.x, int2b.y};
+ decltype(Rgb::c.r)
+ r = (rgb[0].c.r + rgb[1].c.r + rgb[2].c.r + rgb[3].c.r) / 4,
+ g = (rgb[0].c.g + rgb[1].c.g + rgb[2].c.g + rgb[3].c.g) / 4,
+ b = (rgb[0].c.b + rgb[1].c.b + rgb[2].c.b + rgb[3].c.b) / 4;
+
+ uint8_t *p_dst = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch;
+ *(YuvUnitx2 *)p_dst = YuvUnitx2 {
+ RgbToY(rgb[0].c.r, rgb[0].c.g, rgb[0].c.b),
+ RgbToY(rgb[1].c.r, rgb[1].c.g, rgb[1].c.b),
+ };
+ *(YuvUnitx2 *)(p_dst + yuv_pitch) = YuvUnitx2 {
+ RgbToY(rgb[2].c.r, rgb[2].c.g, rgb[2].c.b),
+ RgbToY(rgb[3].c.r, rgb[3].c.g, rgb[3].c.b),
+ };
+ *(YuvUnitx2 *)(p_dst + (height - y / 2) * yuv_pitch) = YuvUnitx2 {
+ RgbToU(r, g, b),
+ RgbToV(r, g, b),
+ };
+}
+
+void Bgra64ToP016(uint8_t *dp_bgra, int bgra_pitch, uint8_t *dp_p016, int p016_pitch, int width, int height, int col_standard) {
+ SetMatRgb2Yuv(col_standard);
+ RgbaToYuvKernel
+ <<>>
+ (dp_bgra, bgra_pitch, dp_p016, p016_pitch, width, height);
+}
+
+template
+__global__ static void RgbToYuvKernel(uint8_t *dp_rgb, int rgb_pitch, uint8_t *dp_yuv, int yuv_pitch, int width, int height) {
+ int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
+ int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
+ if (x + 1 >= width || y + 1 >= height) {
+ return;
+ }
+
+ uint8_t *p_src = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch;
+ RgbInt1 int1a = *(RgbInt1 *)p_src;
+ RgbInt2 int2a = *(RgbInt2 *)(p_src + sizeof(RgbInt1));
+ RgbInt1 int1b = *(RgbInt1 *)(p_src + rgb_pitch);
+ RgbInt2 int2b = *(RgbInt2 *)(p_src + rgb_pitch + sizeof(RgbInt1));
+
+ Rgb rgb[4];
+ rgb[0].v = {int1a.x, int1a.y, int1a.z},
+ rgb[1].v = {int1a.w, int2a.x, int2a.y},
+ rgb[2].v = {int1b.x, int1b.y, int1b.z},
+ rgb[3].v = {int1b.w, int2b.x, int2b.y};
+ decltype(Rgb::c.r)
+ r = (rgb[0].c.r + rgb[1].c.r + rgb[2].c.r + rgb[3].c.r) / 4,
+ g = (rgb[0].c.g + rgb[1].c.g + rgb[2].c.g + rgb[3].c.g) / 4,
+ b = (rgb[0].c.b + rgb[1].c.b + rgb[2].c.b + rgb[3].c.b) / 4;
+
+ uint8_t *p_dst = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch;
+ *(YuvUnitx2 *)p_dst = YuvUnitx2 {
+ RgbToY(rgb[0].c.r, rgb[0].c.g, rgb[0].c.b),
+ RgbToY(rgb[1].c.r, rgb[1].c.g, rgb[1].c.b),
+ };
+ *(YuvUnitx2 *)(p_dst + yuv_pitch) = YuvUnitx2 {
+ RgbToY(rgb[2].c.r, rgb[2].c.g, rgb[2].c.b),
+ RgbToY(rgb[3].c.r, rgb[3].c.g, rgb[3].c.b),
+ };
+ *(YuvUnitx2 *)(p_dst + (height - y / 2) * yuv_pitch) = YuvUnitx2 {
+ RgbToU(r, g, b),
+ RgbToV(r, g, b),
+ };
+}
+
+void Bgr48ToP016(uint8_t *p_bgr, int bgr_pitch, uint8_t *dp_p016, int p016_pitch, int width, int height, int col_standard) {
+ SetMatRgb2Yuv(col_standard);
+ RgbToYuvKernel
+ <<>>
+ (p_bgr, bgr_pitch, dp_p016, p016_pitch, width, height);
+}
diff --git a/utils/colorspace_kernels.h b/utils/colorspace_kernels.h
new file mode 100644
index 0000000000..c600de8582
--- /dev/null
+++ b/utils/colorspace_kernels.h
@@ -0,0 +1,143 @@
+
+/*
+Copyright (c) 2023 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include
+#include
+
+/*!
+ * \file
+ * \brief The AMD Color Space Standards for VCN Decode Library.
+ *
+ * \defgroup group_amd_vcn_colorspace colorSpace: AMD VCN Color Space API
+ * \brief AMD The vcnDECODE Color Space API.
+ */
+
+typedef enum ColorSpaceStandard_ {
+ ColorSpaceStandard_BT709 = 1,
+ ColorSpaceStandard_Unspecified = 2,
+ ColorSpaceStandard_Reserved = 3,
+ ColorSpaceStandard_FCC = 4,
+ ColorSpaceStandard_BT470 = 5,
+ ColorSpaceStandard_BT601 = 6,
+ ColorSpaceStandard_SMPTE240M = 7,
+ ColorSpaceStandard_YCgCo = 8,
+ ColorSpaceStandard_BT2020 = 9,
+ ColorSpaceStandard_BT2020C = 10
+} ColorSpaceStandard;
+
+union BGR24 {
+ uchar3 v;
+ struct {
+ uint8_t b, g, r;
+ } c;
+};
+
+union RGB24 {
+ uchar3 v;
+ struct {
+ uint8_t r, g, b;
+ } c;
+};
+
+union BGR48 {
+ ushort3 v;
+ struct {
+ uint16_t b, g, r;
+ } c;
+};
+
+union RGB48 {
+ ushort3 v;
+ struct {
+ uint16_t r, g, b;
+ } c;
+};
+
+union BGRA32 {
+ uint32_t d;
+ uchar4 v;
+ struct {
+ uint8_t b, g, r, a;
+ } c;
+};
+
+union RGBA32 {
+ uint32_t d;
+ uchar4 v;
+ struct {
+ uint8_t r, g, b, a;
+ } c;
+};
+
+union BGRA64 {
+ uint64_t d;
+ ushort4 v;
+ struct {
+ uint16_t b, g, r, a;
+ } c;
+};
+
+union RGBA64 {
+ uint64_t d;
+ ushort4 v;
+ struct {
+ uint16_t r, g, b, a;
+ } c;
+};
+
+// color-convert hip kernel function definitions
+template
+void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard);
+
+template
+void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard);
+template
+void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard);
+
diff --git a/utils/rocvideodecode/commons.h b/utils/rocvideodecode/commons.h
deleted file mode 100644
index 46aca6ba8f..0000000000
--- a/utils/rocvideodecode/commons.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-Copyright (c) 2023 - 2023 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#pragma once
-#include
-#include
-#include
-#include
-
-#define TOSTR(X) std::to_string(static_cast(X))
-#define STR(X) std::string(X)
-
-#if DBGINFO
-#define INFO(X) std::clog << "[INF] " << " {" << __func__ <<"} " << " " << X << std::endl;
-#else
-#define INFO(X) ;
-#endif
-#define ERR(X) std::cerr << "[ERR] " << " {" << __func__ <<"} " << " " << X << std::endl;
-
-
-class rocVideoDecodeException : public std::exception {
-public:
-
- explicit rocVideoDecodeException(const std::string& message):_message(message){}
- virtual const char* what() const throw() override {
- return _message.c_str();
- }
-private:
- std::string _message;
-};
-
-#define THROW(X) throw rocVideoDecodeException(" { "+std::string(__func__)+" } " + X);
diff --git a/utils/rocvideodecode/roc_video_dec.h b/utils/rocvideodecode/roc_video_dec.h
index 00a16058b5..ecb9353273 100644
--- a/utils/rocvideodecode/roc_video_dec.h
+++ b/utils/rocvideodecode/roc_video_dec.h
@@ -158,6 +158,9 @@ class RocVideoDecoder {
~RocVideoDecoder();
rocDecVideoCodec GetCodecId() { return codec_id_; }
+
+ hipStream_t GetStream() {return hip_stream_;}
+
/**
* @brief Get the output frame width
*/