/* Copyright (c) 2023 - 2026 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "colorspace_kernels.h" #include "roc_video_dec.h" __constant__ float yuv_to_rgb_mat[3][3]; __constant__ float rgb_to_yuv_mat[3][3]; void inline GetColMatCoefficients(int col_standard, float &wr, float &wb, int &black, int &white, int &max) { black = 16; white = 235; max = 255; switch (col_standard) { case ColorSpaceStandard_BT709: default: wr = 0.2126f; wb = 0.0722f; break; case ColorSpaceStandard_FCC: wr = 0.30f; wb = 0.11f; break; case ColorSpaceStandard_BT470: case ColorSpaceStandard_BT601: wr = 0.2990f; wb = 0.1140f; break; case ColorSpaceStandard_SMPTE240M: wr = 0.212f; wb = 0.087f; break; case ColorSpaceStandard_BT2020: case ColorSpaceStandard_BT2020C: wr = 0.2627f; wb = 0.0593f; // 10-bit only black = 64 << 6; white = 940 << 6; max = (1 << 16) - 1; break; } } void SetMatYuv2Rgb(int col_standard) { float wr, wb; int black, white, max; GetColMatCoefficients(col_standard, wr, wb, black, white, max); float mat[3][3] = { 1.0f, 0.0f, (1.0f - wr) / 0.5f, 1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr), 1.0f, (1.0f - wb) / 0.5f, 0.0f, }; for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { mat[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]); } } HIP_API_CALL(hipMemcpyToSymbol(yuv_to_rgb_mat, mat, sizeof(mat))); } void SetMatRgb2Yuv(int col_standard) { float wr, wb; int black, white, max; GetColMatCoefficients(col_standard, wr, wb, black, white, max); float mat[3][3] = { wr, 1.0f - wb - wr, wb, -0.5f * wr / (1.0f - wb), -0.5f * (1 - wb - wr) / (1.0f - wb), 0.5f, 0.5f, -0.5f * (1.0f - wb - wr) / (1.0f - wr), -0.5f * wb / (1.0f - wr), }; for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { mat[i][j] = (float)(1.0 * (white - black) / max * mat[i][j]); } } HIP_API_CALL(hipMemcpyToSymbol(rgb_to_yuv_mat, mat, sizeof(mat))); } template __device__ static T Clamp(T x, T lower, T upper) { return x < lower ? lower : (x > upper ? upper : x); } template __device__ inline Rgb YuvToRgbForPixel(YuvUnit y, YuvUnit u, YuvUnit v) { const int low = 1 << (sizeof(YuvUnit) * 8 - 4), mid = 1 << (sizeof(YuvUnit) * 8 - 1); float fy = (int)y - low, fu = (int)u - mid, fv = (int)v - mid; const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f; YuvUnit r = (YuvUnit)Clamp(yuv_to_rgb_mat[0][0] * fy + yuv_to_rgb_mat[0][1] * fu + yuv_to_rgb_mat[0][2] * fv, 0.0f, maxf), g = (YuvUnit)Clamp(yuv_to_rgb_mat[1][0] * fy + yuv_to_rgb_mat[1][1] * fu + yuv_to_rgb_mat[1][2] * fv, 0.0f, maxf), b = (YuvUnit)Clamp(yuv_to_rgb_mat[2][0] * fy + yuv_to_rgb_mat[2][1] * fu + yuv_to_rgb_mat[2][2] * fv, 0.0f, maxf); Rgb rgb{}; const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(rgb.c.r)) * 8; if (sizeof(YuvUnit) >= sizeof(rgb.c.r)) { rgb.c.r = r >> nShift; rgb.c.g = g >> nShift; rgb.c.b = b >> nShift; } else { rgb.c.r = r << nShift; rgb.c.g = g << nShift; rgb.c.b = b << nShift; } return rgb; } // yuv to RGBA (32/64 bit) template __global__ static void YuvToRgbaKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) { int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; if (x + 1 >= width || y + 1 >= height) { return; } uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; YuvUnitx2 l1 = *(YuvUnitx2 *)(p_src + yuv_pitch); YuvUnitx2 ch = *(YuvUnitx2 *)(p_src + (v_pitch - y / 2) * yuv_pitch); *(RgbIntx2 *)p_dst = RgbIntx2 { YuvToRgbForPixel(l0.x, ch.x, ch.y).d, YuvToRgbForPixel(l0.y, ch.x, ch.y).d, }; *(RgbIntx2 *)(p_dst + rgb_pitch) = RgbIntx2 { YuvToRgbForPixel(l1.x, ch.x, ch.y).d, YuvToRgbForPixel(l1.y, ch.x, ch.y).d, }; } // yuv to RGB (24/48 bit) template __global__ static void YuvToRgbKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) { int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; if (x + 1 >= width || y + 1 >= height) { return; } uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; uint8_t *p_dst1 = p_dst + rgb_pitch; YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; YuvUnitx2 l1 = *(YuvUnitx2 *)(p_src + yuv_pitch); YuvUnitx2 ch = *(YuvUnitx2 *)(p_src + (v_pitch - y / 2) * yuv_pitch); Rgb rgb0 = YuvToRgbForPixel(l0.x, ch.x, ch.y), rgb1 = YuvToRgbForPixel(l0.y, ch.x, ch.y), rgb2 = YuvToRgbForPixel(l1.x, ch.x, ch.y), rgb3 = YuvToRgbForPixel(l1.y, ch.x, ch.y); *(RgbInt1 *)p_dst = RgbInt1 { rgb0.v.x, rgb0.v.y, rgb0.v.z, rgb1.v.x }; *(RgbInt2 *)(p_dst + sizeof(RgbInt1)) = RgbInt2 { rgb1.v.y, rgb1.v.z }; *(RgbInt1 *)(p_dst1) = RgbInt1 { rgb2.v.x, rgb2.v.y, rgb2.v.z, rgb3.v.x }; *(RgbInt2 *)(p_dst1 + sizeof(RgbInt1)) = RgbInt2 { rgb3.v.y, rgb3.v.z }; } // yuv444 to RGBA (32/64 bit) template __global__ static void Yuv444ToRgbaKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) { int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; int y = (threadIdx.y + blockIdx.y * blockDim.y); if (x + 1 >= width || y >= height) { return; } uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; YuvUnitx2 ch1 = *(YuvUnitx2 *)(p_src + (v_pitch * yuv_pitch)); YuvUnitx2 ch2 = *(YuvUnitx2 *)(p_src + (2 * v_pitch * yuv_pitch)); *(RgbIntx2 *)p_dst = RgbIntx2{ YuvToRgbForPixel(l0.x, ch1.x, ch2.x).d, YuvToRgbForPixel(l0.y, ch1.y, ch2.y).d, }; } // yuv444 to RGB (24/48 bit) template __global__ static void Yuv444ToRgbKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgb, int rgb_pitch, int width, int height, int v_pitch) { int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; int y = (threadIdx.y + blockIdx.y * blockDim.y); if (x + 1 >= width || y >= height) { return; } uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; uint8_t *p_dst = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; YuvUnitx2 ch1 = *(YuvUnitx2 *)(p_src + (v_pitch * yuv_pitch)); YuvUnitx2 ch2 = *(YuvUnitx2 *)(p_src + (2 * v_pitch * yuv_pitch)); Rgb rgb0 = YuvToRgbForPixel(l0.x, ch1.x, ch2.x), rgb1 = YuvToRgbForPixel(l0.y, ch1.y, ch2.y); *(RgbInt1 *)p_dst = RgbInt1 { rgb0.v.x, rgb0.v.y, rgb0.v.z, rgb1.v.x }; *(RgbInt2 *)(p_dst + sizeof(RgbInt1)) = RgbInt2 { rgb1.v.y, rgb1.v.z }; } template __global__ static void YuvToRgbPlanarKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgbp, int nRgbpPitch, int width, int height, int v_pitch) { int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; if (x + 1 >= width || y + 1 >= height) { return; } uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; YuvUnitx2 l1 = *(YuvUnitx2 *)(p_src + yuv_pitch); YuvUnitx2 ch = *(YuvUnitx2 *)(p_src + (v_pitch - y / 2) * yuv_pitch); Rgb rgb0 = YuvToRgbForPixel(l0.x, ch.x, ch.y), rgb1 = YuvToRgbForPixel(l0.y, ch.x, ch.y), rgb2 = YuvToRgbForPixel(l1.x, ch.x, ch.y), rgb3 = YuvToRgbForPixel(l1.y, ch.x, ch.y); uint8_t *p_dst = dp_rgbp + x * sizeof(RgbUnitx2) / 2 + y * nRgbpPitch; *(RgbUnitx2 *)p_dst = RgbUnitx2 {rgb0.v.x, rgb1.v.x}; *(RgbUnitx2 *)(p_dst + nRgbpPitch) = RgbUnitx2 {rgb2.v.x, rgb3.v.x}; p_dst += nRgbpPitch * height; *(RgbUnitx2 *)p_dst = RgbUnitx2 {rgb0.v.y, rgb1.v.y}; *(RgbUnitx2 *)(p_dst + nRgbpPitch) = RgbUnitx2 {rgb2.v.y, rgb3.v.y}; p_dst += nRgbpPitch * height; *(RgbUnitx2 *)p_dst = RgbUnitx2 {rgb0.v.z, rgb1.v.z}; *(RgbUnitx2 *)(p_dst + nRgbpPitch) = RgbUnitx2 {rgb2.v.z, rgb3.v.z}; } template __global__ static void Yuv444ToRgbPlanarKernel(uint8_t *dp_yuv, int yuv_pitch, uint8_t *dp_rgbp, int nRgbpPitch, int width, int height, int v_pitch) { int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; int y = (threadIdx.y + blockIdx.y * blockDim.y); if (x + 1 >= width || y >= height) { return; } uint8_t *p_src = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; YuvUnitx2 l0 = *(YuvUnitx2 *)p_src; YuvUnitx2 ch1 = *(YuvUnitx2 *)(p_src + (v_pitch * yuv_pitch)); YuvUnitx2 ch2 = *(YuvUnitx2 *)(p_src + (2 * v_pitch * yuv_pitch)); Rgb rgb0 = YuvToRgbForPixel(l0.x, ch1.x, ch2.x), rgb1 = YuvToRgbForPixel(l0.y, ch1.y, ch2.y); uint8_t *p_dst = dp_rgbp + x * sizeof(RgbUnitx2) / 2 + y * nRgbpPitch; *(RgbUnitx2 *)p_dst = RgbUnitx2{ rgb0.v.x, rgb1.v.x }; p_dst += nRgbpPitch * height; *(RgbUnitx2 *)p_dst = RgbUnitx2{ rgb0.v.y, rgb1.v.y }; p_dst += nRgbpPitch * height; *(RgbUnitx2 *)p_dst = RgbUnitx2{ rgb0.v.z, rgb1.v.z }; } template void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbaKernel <<>> (dp_nv12, nv12_pitch, dp_bgra, bgra_pitch, width, height, v_pitch); } template void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbaKernel <<>> (dp_nv12, nv12_pitch, dp_bgra, bgra_pitch, width, height, v_pitch); } template void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbaKernel <<>> (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); } template void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbaKernel <<>> (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); } template void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbaKernel <<>> (dp_p016, p016_pitch, dp_bgra, bgra_pitch, width, height, v_pitch); } template void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbaKernel <<>> (dp_p016, p016_pitch, dp_bgra, bgra_pitch, width, height, v_pitch); } template void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbaKernel <<>> (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); } template void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbaKernel <<>> (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); } template void Nv12ToColorPlanar(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbPlanarKernel <<>> (dp_nv12, nv12_pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch); } template void P016ToColorPlanar(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbPlanarKernel <<>> (dp_p016, p016_pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch); } template void YUV444ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbPlanarKernel <<>> (dp_yuv_444, pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch); } template void YUV444P16ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbPlanarKernel <<>> (dp_yuv_444, pitch, dp_bgrp, nBgrpPitch, width, height, v_pitch); } // Explicit Instantiation: for RGB32/BGR32 and RGB64/BGR64 formats template void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void Nv12ToColor32(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void Nv12ToColor64(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColor32(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColor64(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColor32(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColor64(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void Nv12ToColorPlanar(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void Nv12ToColorPlanar(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColorPlanar(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColorPlanar(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColorPlanar(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgrp, int nBgrpPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbKernel <<>> (dp_nv12, nv12_pitch, dp_bgr, bgr_pitch, width, height, v_pitch); } template void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbKernel <<>> (dp_nv12, nv12_pitch, dp_bgr, bgr_pitch, width, height, v_pitch); } template void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbKernel <<>> (dp_yuv_444, pitch, dp_bgr, bgr_pitch, width, height, v_pitch); } template void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbKernel <<>> (dp_yuv_444, pitch, dp_bgr, bgr_pitch, width, height, v_pitch); } template void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbKernel <<>> (dp_p016, p016_pitch, dp_bgr, bgr_pitch, width, height, v_pitch); } template void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); YuvToRgbKernel <<>> (dp_p016, p016_pitch, dp_bgr, bgr_pitch, width, height, v_pitch); } template void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgra, int bgra_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbKernel <<>> (dp_yuv_444, pitch, dp_bgra, bgra_pitch, width, height, v_pitch); } template void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *dp_bgr, int bgr_pitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream) { SetMatYuv2Rgb(col_standard); Yuv444ToRgbKernel <<>> (dp_yuv_444, pitch, dp_bgr, bgr_pitch, width, height, v_pitch); } // Explicit Instantiation: for RGB24/BGR24 and RGB48/BGR48 formats template void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void Nv12ToColor24(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void Nv12ToColor48(uint8_t *dp_nv12, int nv12_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColor24(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void P016ToColor48(uint8_t *dp_p016, int p016_pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColor24(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template void YUV444P16ToColor48(uint8_t *dp_yuv_444, int pitch, uint8_t *p_bgr, int p_bgrPitch, int width, int height, int v_pitch, int col_standard, hipStream_t hip_stream); template __device__ inline YuvUnit RgbToY(RgbUnit r, RgbUnit g, RgbUnit b) { const YuvUnit low = 1 << (sizeof(YuvUnit) * 8 - 4); return rgb_to_yuv_mat[0][0] * r + rgb_to_yuv_mat[0][1] * g + rgb_to_yuv_mat[0][2] * b + low; } template __device__ inline YuvUnit RgbToU(RgbUnit r, RgbUnit g, RgbUnit b) { const YuvUnit mid = 1 << (sizeof(YuvUnit) * 8 - 1); return rgb_to_yuv_mat[1][0] * r + rgb_to_yuv_mat[1][1] * g + rgb_to_yuv_mat[1][2] * b + mid; } template __device__ inline YuvUnit RgbToV(RgbUnit r, RgbUnit g, RgbUnit b) { const YuvUnit mid = 1 << (sizeof(YuvUnit) * 8 - 1); return rgb_to_yuv_mat[2][0] * r + rgb_to_yuv_mat[2][1] * g + rgb_to_yuv_mat[2][2] * b + mid; } template __global__ static void RgbaToYuvKernel(uint8_t *dp_rgb, int rgba_pitch, uint8_t *dp_yuv, int yuv_pitch, int width, int height) { int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; if (x + 1 >= width || y + 1 >= height) { return; } uint8_t *p_src = dp_rgb + x * sizeof(Rgb) + y * rgba_pitch; RgbIntx2 int2a = *(RgbIntx2 *)p_src; RgbIntx2 int2b = *(RgbIntx2 *)(p_src + rgba_pitch); Rgb rgb[4] = {int2a.x, int2a.y, int2b.x, int2b.y}; decltype(Rgb::c.r) r = (rgb[0].c.r + rgb[1].c.r + rgb[2].c.r + rgb[3].c.r) / 4, g = (rgb[0].c.g + rgb[1].c.g + rgb[2].c.g + rgb[3].c.g) / 4, b = (rgb[0].c.b + rgb[1].c.b + rgb[2].c.b + rgb[3].c.b) / 4; uint8_t *p_dst = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; *(YuvUnitx2 *)p_dst = YuvUnitx2 { RgbToY(rgb[0].c.r, rgb[0].c.g, rgb[0].c.b), RgbToY(rgb[1].c.r, rgb[1].c.g, rgb[1].c.b), }; *(YuvUnitx2 *)(p_dst + yuv_pitch) = YuvUnitx2 { RgbToY(rgb[2].c.r, rgb[2].c.g, rgb[2].c.b), RgbToY(rgb[3].c.r, rgb[3].c.g, rgb[3].c.b), }; *(YuvUnitx2 *)(p_dst + (height - y / 2) * yuv_pitch) = YuvUnitx2 { RgbToU(r, g, b), RgbToV(r, g, b), }; } void Bgra64ToP016(uint8_t *dp_bgra, int bgra_pitch, uint8_t *dp_p016, int p016_pitch, int width, int height, int col_standard, hipStream_t hip_stream) { SetMatRgb2Yuv(col_standard); RgbaToYuvKernel <<>> (dp_bgra, bgra_pitch, dp_p016, p016_pitch, width, height); } template __global__ static void RgbToYuvKernel(uint8_t *dp_rgb, int rgb_pitch, uint8_t *dp_yuv, int yuv_pitch, int width, int height) { int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; if (x + 1 >= width || y + 1 >= height) { return; } uint8_t *p_src = dp_rgb + x * sizeof(Rgb) + y * rgb_pitch; RgbInt1 int1a = *(RgbInt1 *)p_src; RgbInt2 int2a = *(RgbInt2 *)(p_src + sizeof(RgbInt1)); RgbInt1 int1b = *(RgbInt1 *)(p_src + rgb_pitch); RgbInt2 int2b = *(RgbInt2 *)(p_src + rgb_pitch + sizeof(RgbInt1)); Rgb rgb[4]; rgb[0].v = {int1a.x, int1a.y, int1a.z}, rgb[1].v = {int1a.w, int2a.x, int2a.y}, rgb[2].v = {int1b.x, int1b.y, int1b.z}, rgb[3].v = {int1b.w, int2b.x, int2b.y}; decltype(Rgb::c.r) r = (rgb[0].c.r + rgb[1].c.r + rgb[2].c.r + rgb[3].c.r) / 4, g = (rgb[0].c.g + rgb[1].c.g + rgb[2].c.g + rgb[3].c.g) / 4, b = (rgb[0].c.b + rgb[1].c.b + rgb[2].c.b + rgb[3].c.b) / 4; uint8_t *p_dst = dp_yuv + x * sizeof(YuvUnitx2) / 2 + y * yuv_pitch; *(YuvUnitx2 *)p_dst = YuvUnitx2 { RgbToY(rgb[0].c.r, rgb[0].c.g, rgb[0].c.b), RgbToY(rgb[1].c.r, rgb[1].c.g, rgb[1].c.b), }; *(YuvUnitx2 *)(p_dst + yuv_pitch) = YuvUnitx2 { RgbToY(rgb[2].c.r, rgb[2].c.g, rgb[2].c.b), RgbToY(rgb[3].c.r, rgb[3].c.g, rgb[3].c.b), }; *(YuvUnitx2 *)(p_dst + (height - y / 2) * yuv_pitch) = YuvUnitx2 { RgbToU(r, g, b), RgbToV(r, g, b), }; } void Bgr48ToP016(uint8_t *p_bgr, int bgr_pitch, uint8_t *dp_p016, int p016_pitch, int width, int height, int col_standard, hipStream_t hip_stream) { SetMatRgb2Yuv(col_standard); RgbToYuvKernel <<>> (p_bgr, bgr_pitch, dp_p016, p016_pitch, width, height); }