Add support for YUV440 (#28)

* Add support for YUV440

* Add YUV440 to RGB kernels

* code clean up
This commit is contained in:
Aryan Salmanpour
2024-05-30 22:00:07 -04:00
committed by GitHub
parent b7edd5de5b
commit d0b812bc26
11 changed files with 546 additions and 18 deletions
+1
View File
@@ -7,6 +7,7 @@ rocJPEG is a high performance JPEG decode SDK for AMD GPUs. Using the rocJPEG AP
## Supported JPEG chroma subsampling
* YUV 4:4:4
* YUV 4:4:0
* YUV 4:2:2
* YUV 4:2:0
* YUV 4:0:0
+1 -1
View File
@@ -122,7 +122,7 @@ typedef struct {
*/
typedef enum {
/**< return native unchanged decoded YUV image from the VCN JPEG deocder.
For ROCJPEG_CSS_444 write Y, U, and V to first, second, and third channels of RocJpegImage
For ROCJPEG_CSS_444 and ROCJPEG_CSS_440 write Y, U, and V to first, second, and third channels of RocJpegImage
For ROCJPEG_CSS_422 write YUYV (packed) to first channel of RocJpegImage
For ROCJPEG_CSS_420 write Y to first channel and UV (interleaved) to second channel of RocJpegImage
For ROCJPEG_CSS_400 write Y to first channel of RocJpegImage */
+4 -3
View File
@@ -103,7 +103,7 @@ list is composed of the chroma subsampling property retrieved from the JPEG imag
.. note::
The VCN hardware-accelerated JPEG decoder in AMD GPUs only supports decoding JPEG images with ``ROCJPEG_CSS_444``, ``ROCJPEG_CSS_422``,
The VCN hardware-accelerated JPEG decoder in AMD GPUs only supports decoding JPEG images with ``ROCJPEG_CSS_444``, ``ROCJPEG_CSS_440``, ``ROCJPEG_CSS_422``,
``ROCJPEG_CSS_420``, and ``ROCJPEG_CSS_400`` chroma subsampling.
6. Decode a JPEG stream
@@ -149,7 +149,7 @@ You can set the ``RocJpegOutputFormat`` parameter of the ``RocJpegDecodeParams``
For example, if ``output_format`` is set to ``ROCJPEG_OUTPUT_NATIVE``, then based on the chroma subsampling of the input image, the
``rocJpegDecode()`` function does one of the following:
* For ``ROCJPEG_CSS_444`` write Y, U, and V to first, second, and third channels of ``RocJpegImage``.
* For ``ROCJPEG_CSS_444`` and ``ROCJPEG_CSS_440`` write Y, U, and V to first, second, and third channels of ``RocJpegImage``.
* For ``ROCJPEG_CSS_422`` write YUYV (packed) to first channel of ``RocJpegImage``.
* For ``ROCJPEG_CSS_420`` write Y to first channel and UV (interleaved) to second channel of ``RocJpegImage``.
* For ``ROCJPEG_CSS_400`` write Y to first channel of ``RocJpegImage``.
@@ -168,10 +168,11 @@ the required size for the output buffers for a single decode JPEG. To optimally
:header: "output_format", "chroma subsampling", "destination.pitch[c] should be atleast:", "destination.channel[c] should be atleast:"
"ROCJPEG_OUTPUT_NATIVE", "ROCJPEG_CSS_444", "destination.pitch[c] = widths[c] for c = 0, 1, 2", "destination.channel[c] = destination.pitch[c] * heights[0] for c = 0, 1, 2"
"ROCJPEG_OUTPUT_NATIVE", "ROCJPEG_CSS_440", "destination.pitch[c] = widths[c] for c = 0, 1, 2", "destination.channel[0] = destination.pitch[0] * heights[0], destination.channel[c] = destination.pitch[c] * heights[0] / 2 for c = 1, 2"
"ROCJPEG_OUTPUT_NATIVE", "ROCJPEG_CSS_422", "destination.pitch[0] = widths[0] * 2", "destination.channel[0] = destination.pitch[0] * heights[0]"
"ROCJPEG_OUTPUT_NATIVE", "ROCJPEG_CSS_420", "destination.pitch[1] = destination.pitch[0] = widths[0]", "destination.channel[0] = destination.pitch[0] * heights[0], destination.channel[1] = destination.pitch[1] * (heights[0] >> 1)"
"ROCJPEG_OUTPUT_NATIVE", "ROCJPEG_CSS_400", "destination.pitch[0] = widths[0]", "destination.channel[0] = destination.pitch[0] * heights[0]"
"ROCJPEG_OUTPUT_YUV_PLANAR", "ROCJPEG_CSS_444, ROCJPEG_CSS_422, ROCJPEG_CSS_420", "destination.pitch[c] = widths[c] for c = 0, 1, 2", "destination.channel[c] = destination.pitch[c] * heights[c] for c = 0, 1, 2"
"ROCJPEG_OUTPUT_YUV_PLANAR", "ROCJPEG_CSS_444, ROCJPEG_CSS_440, ROCJPEG_CSS_422, ROCJPEG_CSS_420", "destination.pitch[c] = widths[c] for c = 0, 1, 2", "destination.channel[c] = destination.pitch[c] * heights[c] for c = 0, 1, 2"
"ROCJPEG_OUTPUT_YUV_PLANAR", "ROCJPEG_CSS_400", "destination.pitch[0] = widths[0]", "destination.channel[0] = destination.pitch[0] * heights[0]"
"ROCJPEG_OUTPUT_Y", "Any of the supported chroma subsampling", "destination.pitch[0] = widths[0]", "destination.channel[0] = destination.pitch[0] * heights[0]"
"ROCJPEG_OUTPUT_RGB", "Any of the supported chroma subsampling", "destination.pitch[0] = widths[0] * 3", "destination.channel[0] = destination.pitch[0] * heights[0]"
+11 -4
View File
@@ -29,6 +29,7 @@ Supported JPEG chroma subsampling
========================================
* YUV 4:4:4
* YUV 4:4:0
* YUV 4:2:2
* YUV 4:2:0
* YUV 4:0:0
@@ -45,17 +46,23 @@ Prerequisites
* `ROCm-supported hardware <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html>`_
(``gfx908`` or higher is required)
* Install ROCm 6.1.0 or later with
* Install ROCm 6.3.0 or later with
`amdgpu-install <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/amdgpu-install.html>`_
* Run: ``--usecase=rocm``
* To install rocJPEG with minimum requirements, follow the :doc:`quick-start instructions <./quick-start>`
* AMD multimedia packages
* Video Acceleration API - Version `1.5.0+` - `Libva` is an implementation for VA-API
.. code:: shell
sudo apt install libva-amdgpu-dev libdrm-amdgpu1 mesa-amdgpu-va-drivers
sudo apt install libva-dev
* AMD VA Drivers
.. code:: shell
sudo apt install mesa-amdgpu-va-drivers
* CMake 3.5 or later
@@ -63,7 +70,7 @@ Prerequisites
sudo apt install cmake
* `pkg-config <https://en.wikipedia.org/wiki/Pkg-config>`_
* pkg-config
.. code:: shell
+1 -1
View File
@@ -91,7 +91,7 @@ int main(int argc, char **argv) {
std::cout << "Input file name: " << base_file_name << std::endl;
std::cout << "Input image resolution: " << widths[0] << "x" << heights[0] << std::endl;
std::cout << "Chroma subsampling: " + chroma_sub_sampling << std::endl;
if (subsampling == ROCJPEG_CSS_440 || subsampling == ROCJPEG_CSS_411) {
if (subsampling == ROCJPEG_CSS_411) {
std::cerr << "The chroma sub-sampling is not supported by VCN Hardware" << std::endl;
if (is_dir) {
std::cout << std::endl;
@@ -71,7 +71,7 @@ void ThreadFunction(std::vector<std::string>& jpegFiles, RocJpegHandle rocjpeg_h
CHECK_ROCJPEG(rocJpegStreamParse(reinterpret_cast<uint8_t *>(file_data.data()), file_size, rocjpeg_stream));
CHECK_ROCJPEG(rocJpegGetImageInfo(rocjpeg_handle, rocjpeg_stream, &num_components, &subsampling, widths, heights));
if (subsampling == ROCJPEG_CSS_440 || subsampling == ROCJPEG_CSS_411) {
if (subsampling == ROCJPEG_CSS_411) {
std::cout << "The chroma sub-sampling is not supported by VCN Hardware" << std::endl;
std::cout << "Skipping decoding file " << base_file_name << std::endl;
return;
+16
View File
@@ -276,6 +276,12 @@ public:
output_image.pitch[2] = output_image.pitch[1] = output_image.pitch[0] = widths[0];
channel_sizes[2] = channel_sizes[1] = channel_sizes[0] = output_image.pitch[0] * heights[0];
break;
case ROCJPEG_CSS_440:
num_channels = 3;
output_image.pitch[2] = output_image.pitch[1] = output_image.pitch[0] = widths[0];
channel_sizes[0] = output_image.pitch[0] * heights[0];
channel_sizes[2] = channel_sizes[1] = output_image.pitch[0] * (heights[0] >> 1);
break;
case ROCJPEG_CSS_422:
num_channels = 1;
output_image.pitch[0] = widths[0] * 2;
@@ -407,6 +413,11 @@ public:
widths[2] = widths[1] = widths[0] = img_width;
heights[2] = heights[1] = heights[0] = img_height;
break;
case ROCJPEG_CSS_440:
widths[2] = widths[1] = widths[0] = img_width;
heights[0] = img_height;
heights[2] = heights[1] = img_height >> 1;
break;
case ROCJPEG_CSS_422:
widths[0] = img_width * 2;
heights[0] = img_height;
@@ -431,6 +442,11 @@ public:
widths[2] = widths[1] = widths[0] = img_width;
heights[2] = heights[1] = heights[0] = img_height;
break;
case ROCJPEG_CSS_440:
widths[2] = widths[1] = widths[0] = img_width;
heights[0] = img_height;
heights[2] = heights[1] = img_height >> 1;
break;
case ROCJPEG_CSS_422:
widths[0] = img_width;
widths[2] = widths[1] = widths[0] >> 1;
+21 -8
View File
@@ -128,8 +128,9 @@ RocJpegStatus RocJpegDecoder::Decode(RocJpegStreamHandle jpeg_stream_handle, con
if (hip_interop_dev_mem.surface_format == VA_FOURCC_NV12) {
// Copy the second channel (UV interleaved) for NV12
CHECK_ROCJPEG(CopyChannel(hip_interop_dev_mem, chroma_height, 1, destination));
} else if (hip_interop_dev_mem.surface_format == VA_FOURCC_444P) {
// Copy the second and third channels for YUV444
} else if (hip_interop_dev_mem.surface_format == VA_FOURCC_444P ||
hip_interop_dev_mem.surface_format == VA_FOURCC_422V) {
// Copy the second and third channels for YUV444 and YUV440 (i.e., YUV422V)
CHECK_ROCJPEG(CopyChannel(hip_interop_dev_mem, chroma_height, 1, destination));
CHECK_ROCJPEG(CopyChannel(hip_interop_dev_mem, chroma_height, 2, destination));
}
@@ -194,6 +195,11 @@ RocJpegStatus RocJpegDecoder::GetImageInfo(RocJpegStreamHandle jpeg_stream_handl
widths[2] = widths[1] = widths[0];
heights[2] = heights[1] = heights[0];
break;
case CSS_440:
*subsampling = ROCJPEG_CSS_440;
widths[2] = widths[1] = widths[0];
heights[2] = heights[1] = heights[0] >> 1;
break;
case CSS_422:
*subsampling = ROCJPEG_CSS_422;
widths[2] = widths[1] = widths[0] >> 1;
@@ -214,11 +220,6 @@ RocJpegStatus RocJpegDecoder::GetImageInfo(RocJpegStreamHandle jpeg_stream_handl
widths[2] = widths[1] = widths[0] >> 2;
heights[2] = heights[1] = heights[0];
break;
case CSS_440:
*subsampling = ROCJPEG_CSS_440;
widths[2] = widths[1] = widths[0] >> 1;
heights[2] = heights[1] = heights[0] >> 1;
break;
default:
*subsampling = ROCJPEG_CSS_UNKNOWN;
break;
@@ -278,6 +279,9 @@ RocJpegStatus RocJpegDecoder::GetChromaHeight(uint32_t surface_format, uint16_t
case ROCJPEG_FOURCC_YUYV: /*YUYV: one-plane packed 8-bit YUV 4:2:2. Four bytes per pair of pixels: Y, U, Y, V*/
chroma_height = picture_height;
break;
case VA_FOURCC_422V: /*422V: three-plane 8-bit YUV 4:4:0*/
chroma_height = picture_height >> 1;
break;
default:
return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED;
}
@@ -304,6 +308,10 @@ RocJpegStatus RocJpegDecoder::ColorConvertToRGB(HipInteropDeviceMem& hip_interop
ColorConvertYUV444ToRGB(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0],
hip_interop_dev_mem.hip_mapped_device_mem, hip_interop_dev_mem.pitch[0], hip_interop_dev_mem.offset[1]);
break;
case VA_FOURCC_422V:
ColorConvertYUV440ToRGB(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0],
hip_interop_dev_mem.hip_mapped_device_mem, hip_interop_dev_mem.pitch[0], hip_interop_dev_mem.offset[1], hip_interop_dev_mem.offset[2]);
break;
case ROCJPEG_FOURCC_YUYV:
ColorConvertYUYVToRGB(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0],
hip_interop_dev_mem.hip_mapped_device_mem, hip_interop_dev_mem.pitch[0]);
@@ -349,6 +357,10 @@ RocJpegStatus RocJpegDecoder::ColorConvertToRGBPlanar(HipInteropDeviceMem& hip_i
ColorConvertYUV444ToRGBPlanar(hip_stream_, picture_width, picture_height, destination->channel[0], destination->channel[1], destination->channel[2], destination->pitch[0],
hip_interop_dev_mem.hip_mapped_device_mem, hip_interop_dev_mem.pitch[0], hip_interop_dev_mem.offset[1]);
break;
case VA_FOURCC_422V:
ColorConvertYUV440ToRGBPlanar(hip_stream_, picture_width, picture_height, destination->channel[0], destination->channel[1], destination->channel[2], destination->pitch[0],
hip_interop_dev_mem.hip_mapped_device_mem, hip_interop_dev_mem.pitch[0], hip_interop_dev_mem.offset[1], hip_interop_dev_mem.offset[2]);
break;
case ROCJPEG_FOURCC_YUYV:
ColorConvertYUYVToRGBPlanar(hip_stream_, picture_width, picture_height, destination->channel[0], destination->channel[1], destination->channel[2], destination->pitch[0],
hip_interop_dev_mem.hip_mapped_device_mem, hip_interop_dev_mem.pitch[0]);
@@ -404,7 +416,8 @@ RocJpegStatus RocJpegDecoder::GetPlanarYUVOutputFormat(HipInteropDeviceMem& hip_
// Extract the interleaved UV channels and copy them into the second and thrid channels of the destination.
ConvertInterleavedUVToPlanarUV(hip_stream_, picture_width >> 1, picture_height >> 1, destination->channel[1], destination->channel[2],
destination->pitch[1], hip_interop_dev_mem.hip_mapped_device_mem + hip_interop_dev_mem.offset[1] , hip_interop_dev_mem.pitch[1]);
} else if (hip_interop_dev_mem.surface_format == VA_FOURCC_444P) {
} else if (hip_interop_dev_mem.surface_format == VA_FOURCC_444P ||
hip_interop_dev_mem.surface_format == VA_FOURCC_422V) {
CHECK_ROCJPEG(CopyChannel(hip_interop_dev_mem, chroma_height, 1, destination));
CHECK_ROCJPEG(CopyChannel(hip_interop_dev_mem, chroma_height, 2, destination));
}
+431
View File
@@ -486,6 +486,437 @@ void ColorConvertYUV444ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint3
dst_width_comp, dst_height_comp, src_yuv_image_stride_in_bytes_comp);
}
__global__ void ColorConvertYUV440ToRGBKernel(uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image, uint32_t dst_image_stride_in_bytes,
uint32_t dst_image_stride_in_bytes_comp, const uint8_t *src_y_image, const uint8_t *src_u_image, const uint8_t *src_v_image,
uint32_t src_yuv_image_stride_in_bytes, uint32_t dst_width_comp, uint32_t dst_height_comp, uint32_t src_yuv_image_stride_in_bytes_comp) {
int32_t x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int32_t y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
if ((x < dst_width_comp) && (y < dst_height_comp)) {
uint32_t src_y0_idx = y * src_yuv_image_stride_in_bytes_comp + (x << 3);
uint32_t src_y1_idx = src_y0_idx + src_yuv_image_stride_in_bytes;
uint32_t src_chroma_idx = y * src_yuv_image_stride_in_bytes + (x << 3);
uint2 y0 = *((uint2 *)(&src_y_image[src_y0_idx]));
uint2 y1 = *((uint2 *)(&src_y_image[src_y1_idx]));
uint2 u0 = *((uint2 *)(&src_u_image[src_chroma_idx]));
uint2 v0 = *((uint2 *)(&src_v_image[src_chroma_idx]));
uint32_t rgb0_idx = y * dst_image_stride_in_bytes_comp + (x * 24);
uint32_t rgb1_idx = rgb0_idx + dst_image_stride_in_bytes;
float2 cr = make_float2( 0.0000f, 1.5748f);
float2 cg = make_float2(-0.1873f, -0.4681f);
float2 cb = make_float2( 1.8556f, 0.0000f);
float3 yuv;
DUINT6 rgb0, rgb1;
float4 f;
yuv = make_float3(hipUnpack0(y0.x), hipUnpack0(u0.x), hipUnpack0(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.x = fmaf(cr.y, yuv.z, yuv.x);
f.y = fmaf(cg.x, yuv.y, yuv.x);
f.y = fmaf(cg.y, yuv.z, f.y);
f.z = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack1(y0.x), hipUnpack1(u0.x), hipUnpack1(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.w = fmaf(cr.y, yuv.z, yuv.x);
rgb0.data[0] = hipPack(f);
f.x = fmaf(cg.x, yuv.y, yuv.x);
f.x = fmaf(cg.y, yuv.z, f.x);
f.y = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack2(y0.x), hipUnpack2(u0.x), hipUnpack2(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.z = fmaf(cr.y, yuv.z, yuv.x);
f.w = fmaf(cg.x, yuv.y, yuv.x);
f.w = fmaf(cg.y, yuv.z, f.w);
rgb0.data[1] = hipPack(f);
f.x = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack3(y0.x), hipUnpack3(u0.x), hipUnpack3(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.y = fmaf(cr.y, yuv.z, yuv.x);
f.z = fmaf(cg.x, yuv.y, yuv.x);
f.z = fmaf(cg.y, yuv.z, f.z);
f.w = fmaf(cb.x, yuv.y, yuv.x);
rgb0.data[2] = hipPack(f);
yuv = make_float3(hipUnpack0(y0.y), hipUnpack0(u0.y), hipUnpack0(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.x = fmaf(cr.y, yuv.z, yuv.x);
f.y = fmaf(cg.x, yuv.y, yuv.x);
f.y = fmaf(cg.y, yuv.z, f.y);
f.z = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack1(y0.y), hipUnpack1(u0.y), hipUnpack1(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.w = fmaf(cr.y, yuv.z, yuv.x);
rgb0.data[3] = hipPack(f);
f.x = fmaf(cg.x, yuv.y, yuv.x);
f.x = fmaf(cg.y, yuv.z, f.x);
f.y = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack2(y0.y), hipUnpack2(u0.y), hipUnpack2(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.z = fmaf(cr.y, yuv.z, yuv.x);
f.w = fmaf(cg.x, yuv.y, yuv.x);
f.w = fmaf(cg.y, yuv.z, f.w);
rgb0.data[4] = hipPack(f);
f.x = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack3(y0.y), hipUnpack3(u0.y), hipUnpack3(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.y = fmaf(cr.y, yuv.z, yuv.x);
f.z = fmaf(cg.x, yuv.y, yuv.x);
f.z = fmaf(cg.y, yuv.z, f.z);
f.w = fmaf(cb.x, yuv.y, yuv.x);
rgb0.data[5] = hipPack(f);
yuv = make_float3(hipUnpack0(y1.x), hipUnpack0(u0.x), hipUnpack0(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.x = fmaf(cr.y, yuv.z, yuv.x);
f.y = fmaf(cg.x, yuv.y, yuv.x);
f.y = fmaf(cg.y, yuv.z, f.y);
f.z = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack1(y1.x), hipUnpack1(u0.x), hipUnpack1(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.w = fmaf(cr.y, yuv.z, yuv.x);
rgb1.data[0] = hipPack(f);
f.x = fmaf(cg.x, yuv.y, yuv.x);
f.x = fmaf(cg.y, yuv.z, f.x);
f.y = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack2(y1.x), hipUnpack2(u0.x), hipUnpack2(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.z = fmaf(cr.y, yuv.z, yuv.x);
f.w = fmaf(cg.x, yuv.y, yuv.x);
f.w = fmaf(cg.y, yuv.z, f.w);
rgb1.data[1] = hipPack(f);
f.x = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack3(y1.x), hipUnpack3(u0.x), hipUnpack3(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.y = fmaf(cr.y, yuv.z, yuv.x);
f.z = fmaf(cg.x, yuv.y, yuv.x);
f.z = fmaf(cg.y, yuv.z, f.z);
f.w = fmaf(cb.x, yuv.y, yuv.x);
rgb1.data[2] = hipPack(f);
yuv = make_float3(hipUnpack0(y1.y), hipUnpack0(u0.y), hipUnpack0(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.x = fmaf(cr.y, yuv.z, yuv.x);
f.y = fmaf(cg.x, yuv.y, yuv.x);
f.y = fmaf(cg.y, yuv.z, f.y);
f.z = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack1(y1.y), hipUnpack1(u0.y), hipUnpack1(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.w = fmaf(cr.y, yuv.z, yuv.x);
rgb1.data[3] = hipPack(f);
f.x = fmaf(cg.x, yuv.y, yuv.x);
f.x = fmaf(cg.y, yuv.z, f.x);
f.y = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack2(y1.y), hipUnpack2(u0.y), hipUnpack2(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.z = fmaf(cr.y, yuv.z, yuv.x);
f.w = fmaf(cg.x, yuv.y, yuv.x);
f.w = fmaf(cg.y, yuv.z, f.w);
rgb1.data[4] = hipPack(f);
f.x = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack3(y1.y), hipUnpack3(u0.y), hipUnpack3(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.y = fmaf(cr.y, yuv.z, yuv.x);
f.z = fmaf(cg.x, yuv.y, yuv.x);
f.z = fmaf(cg.y, yuv.z, f.z);
f.w = fmaf(cb.x, yuv.y, yuv.x);
rgb1.data[5] = hipPack(f);
*((DUINT6 *)(&dst_image[rgb0_idx])) = rgb0;
*((DUINT6 *)(&dst_image[rgb1_idx])) = rgb1;
}
}
/**
* @brief Converts YUV440 image to RGB image.
*
* This function takes a YUV440 image and converts it to an RGB image using the ColorConvertYUV444ToRGBKernel HIP kernel.
*
* @param stream The HIP stream used for asynchronous execution of the kernel.
* @param dst_width The width of the destination RGB image.
* @param dst_height The height of the destination RGB image.
* @param dst_image Pointer to the destination RGB image buffer.
* @param dst_image_stride_in_bytes The stride (in bytes) of the destination RGB image buffer.
* @param src_yuv_image Pointer to the source YUV440 image buffer.
* @param src_yuv_image_stride_in_bytes The stride (in bytes) of the source YUV440 image buffer.
* @param src_u_image_offset The offset (in bytes) to the U component in the source YUV440 image buffer.
*/
void ColorConvertYUV440ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_height,
uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, const uint8_t *src_yuv_image,
uint32_t src_yuv_image_stride_in_bytes, uint32_t src_u_image_offset, uint32_t src_v_image_offset) {
int32_t local_threads_x = 16;
int32_t local_threads_y = 4;
int32_t global_threads_x = (dst_width + 7) >> 3;
int32_t global_threads_y = (dst_height + 1) >> 1;
uint32_t dst_width_comp = (dst_width + 7) / 8;
uint32_t dst_height_comp = (dst_height + 1) / 2;
uint32_t dst_image_stride_in_bytes_comp = dst_image_stride_in_bytes * 2;
uint32_t src_yuv_image_stride_in_bytes_comp = src_yuv_image_stride_in_bytes * 2;
ColorConvertYUV440ToRGBKernel<<<dim3(ceil(static_cast<float>(global_threads_x) / local_threads_x), ceil(static_cast<float>(global_threads_y) / local_threads_y)),
dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, dst_image,
dst_image_stride_in_bytes, dst_image_stride_in_bytes_comp, src_yuv_image, src_yuv_image + src_u_image_offset,
src_yuv_image + src_v_image_offset, src_yuv_image_stride_in_bytes,
dst_width_comp, dst_height_comp, src_yuv_image_stride_in_bytes_comp);
}
__global__ void ColorConvertYUV440ToRGBPlanarKernel(uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes,
uint32_t dst_image_stride_in_bytes_comp, const uint8_t *src_y_image, const uint8_t *src_u_image, const uint8_t *src_v_image,
uint32_t src_yuv_image_stride_in_bytes, uint32_t dst_width_comp, uint32_t dst_height_comp, uint32_t src_yuv_image_stride_in_bytes_comp) {
int32_t x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int32_t y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
if ((x < dst_width_comp) && (y < dst_height_comp)) {
uint32_t src_y0_idx = y * src_yuv_image_stride_in_bytes_comp + (x << 3);
uint32_t src_y1_idx = src_y0_idx + src_yuv_image_stride_in_bytes;
uint32_t src_chroma_idx = y * src_yuv_image_stride_in_bytes + (x << 3);
uint2 y0 = *((uint2 *)(&src_y_image[src_y0_idx]));
uint2 y1 = *((uint2 *)(&src_y_image[src_y1_idx]));
uint2 u0 = *((uint2 *)(&src_u_image[src_chroma_idx]));
uint2 v0 = *((uint2 *)(&src_v_image[src_chroma_idx]));
uint32_t rgb0_idx = y * dst_image_stride_in_bytes_comp + (x * 8);
uint32_t rgb1_idx = rgb0_idx + dst_image_stride_in_bytes;
float2 cr = make_float2( 0.0000f, 1.5748f);
float2 cg = make_float2(-0.1873f, -0.4681f);
float2 cb = make_float2( 1.8556f, 0.0000f);
float3 yuv;
DUINT6 rgb0, rgb1;
float4 f;
yuv = make_float3(hipUnpack0(y0.x), hipUnpack0(u0.x), hipUnpack0(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.x = fmaf(cr.y, yuv.z, yuv.x);
f.y = fmaf(cg.x, yuv.y, yuv.x);
f.y = fmaf(cg.y, yuv.z, f.y);
f.z = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack1(y0.x), hipUnpack1(u0.x), hipUnpack1(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.w = fmaf(cr.y, yuv.z, yuv.x);
rgb0.data[0] = hipPack(f);
f.x = fmaf(cg.x, yuv.y, yuv.x);
f.x = fmaf(cg.y, yuv.z, f.x);
f.y = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack2(y0.x), hipUnpack2(u0.x), hipUnpack2(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.z = fmaf(cr.y, yuv.z, yuv.x);
f.w = fmaf(cg.x, yuv.y, yuv.x);
f.w = fmaf(cg.y, yuv.z, f.w);
rgb0.data[1] = hipPack(f);
f.x = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack3(y0.x), hipUnpack3(u0.x), hipUnpack3(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.y = fmaf(cr.y, yuv.z, yuv.x);
f.z = fmaf(cg.x, yuv.y, yuv.x);
f.z = fmaf(cg.y, yuv.z, f.z);
f.w = fmaf(cb.x, yuv.y, yuv.x);
rgb0.data[2] = hipPack(f);
yuv = make_float3(hipUnpack0(y0.y), hipUnpack0(u0.y), hipUnpack0(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.x = fmaf(cr.y, yuv.z, yuv.x);
f.y = fmaf(cg.x, yuv.y, yuv.x);
f.y = fmaf(cg.y, yuv.z, f.y);
f.z = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack1(y0.y), hipUnpack1(u0.y), hipUnpack1(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.w = fmaf(cr.y, yuv.z, yuv.x);
rgb0.data[3] = hipPack(f);
f.x = fmaf(cg.x, yuv.y, yuv.x);
f.x = fmaf(cg.y, yuv.z, f.x);
f.y = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack2(y0.y), hipUnpack2(u0.y), hipUnpack2(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.z = fmaf(cr.y, yuv.z, yuv.x);
f.w = fmaf(cg.x, yuv.y, yuv.x);
f.w = fmaf(cg.y, yuv.z, f.w);
rgb0.data[4] = hipPack(f);
f.x = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack3(y0.y), hipUnpack3(u0.y), hipUnpack3(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.y = fmaf(cr.y, yuv.z, yuv.x);
f.z = fmaf(cg.x, yuv.y, yuv.x);
f.z = fmaf(cg.y, yuv.z, f.z);
f.w = fmaf(cb.x, yuv.y, yuv.x);
rgb0.data[5] = hipPack(f);
yuv = make_float3(hipUnpack0(y1.x), hipUnpack0(u0.x), hipUnpack0(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.x = fmaf(cr.y, yuv.z, yuv.x);
f.y = fmaf(cg.x, yuv.y, yuv.x);
f.y = fmaf(cg.y, yuv.z, f.y);
f.z = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack1(y1.x), hipUnpack1(u0.x), hipUnpack1(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.w = fmaf(cr.y, yuv.z, yuv.x);
rgb1.data[0] = hipPack(f);
f.x = fmaf(cg.x, yuv.y, yuv.x);
f.x = fmaf(cg.y, yuv.z, f.x);
f.y = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack2(y1.x), hipUnpack2(u0.x), hipUnpack2(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.z = fmaf(cr.y, yuv.z, yuv.x);
f.w = fmaf(cg.x, yuv.y, yuv.x);
f.w = fmaf(cg.y, yuv.z, f.w);
rgb1.data[1] = hipPack(f);
f.x = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack3(y1.x), hipUnpack3(u0.x), hipUnpack3(v0.x));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.y = fmaf(cr.y, yuv.z, yuv.x);
f.z = fmaf(cg.x, yuv.y, yuv.x);
f.z = fmaf(cg.y, yuv.z, f.z);
f.w = fmaf(cb.x, yuv.y, yuv.x);
rgb1.data[2] = hipPack(f);
yuv = make_float3(hipUnpack0(y1.y), hipUnpack0(u0.y), hipUnpack0(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.x = fmaf(cr.y, yuv.z, yuv.x);
f.y = fmaf(cg.x, yuv.y, yuv.x);
f.y = fmaf(cg.y, yuv.z, f.y);
f.z = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack1(y1.y), hipUnpack1(u0.y), hipUnpack1(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.w = fmaf(cr.y, yuv.z, yuv.x);
rgb1.data[3] = hipPack(f);
f.x = fmaf(cg.x, yuv.y, yuv.x);
f.x = fmaf(cg.y, yuv.z, f.x);
f.y = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack2(y1.y), hipUnpack2(u0.y), hipUnpack2(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.z = fmaf(cr.y, yuv.z, yuv.x);
f.w = fmaf(cg.x, yuv.y, yuv.x);
f.w = fmaf(cg.y, yuv.z, f.w);
rgb1.data[4] = hipPack(f);
f.x = fmaf(cb.x, yuv.y, yuv.x);
yuv = make_float3(hipUnpack3(y1.y), hipUnpack3(u0.y), hipUnpack3(v0.y));
yuv.y -= 128.0f;
yuv.z -= 128.0f;
f.y = fmaf(cr.y, yuv.z, yuv.x);
f.z = fmaf(cg.x, yuv.y, yuv.x);
f.z = fmaf(cg.y, yuv.z, f.z);
f.w = fmaf(cb.x, yuv.y, yuv.x);
rgb1.data[5] = hipPack(f);
uint2 red0, red1, green0, green1, blue0, blue1;
red0.x = hipPack(make_float4(hipUnpack0(rgb0.data[0]), hipUnpack3(rgb0.data[0]), hipUnpack2(rgb0.data[1]), hipUnpack1(rgb0.data[2])));
red0.y = hipPack(make_float4(hipUnpack0(rgb0.data[3]), hipUnpack3(rgb0.data[3]), hipUnpack2(rgb0.data[4]), hipUnpack1(rgb0.data[5])));
red1.x = hipPack(make_float4(hipUnpack0(rgb1.data[0]), hipUnpack3(rgb1.data[0]), hipUnpack2(rgb1.data[1]), hipUnpack1(rgb1.data[2])));
red1.y = hipPack(make_float4(hipUnpack0(rgb1.data[3]), hipUnpack3(rgb1.data[3]), hipUnpack2(rgb1.data[4]), hipUnpack1(rgb1.data[5])));
green0.x = hipPack(make_float4(hipUnpack1(rgb0.data[0]), hipUnpack0(rgb0.data[1]), hipUnpack3(rgb0.data[1]), hipUnpack2(rgb0.data[2])));
green0.y = hipPack(make_float4(hipUnpack1(rgb0.data[3]), hipUnpack0(rgb0.data[4]), hipUnpack3(rgb0.data[4]), hipUnpack2(rgb0.data[5])));
green1.x = hipPack(make_float4(hipUnpack1(rgb1.data[0]), hipUnpack0(rgb1.data[1]), hipUnpack3(rgb1.data[1]), hipUnpack2(rgb1.data[2])));
green1.y = hipPack(make_float4(hipUnpack1(rgb1.data[3]), hipUnpack0(rgb1.data[4]), hipUnpack3(rgb1.data[4]), hipUnpack2(rgb1.data[5])));
blue0.x = hipPack(make_float4(hipUnpack2(rgb0.data[0]), hipUnpack1(rgb0.data[1]), hipUnpack0(rgb0.data[2]), hipUnpack3(rgb0.data[2])));
blue0.y = hipPack(make_float4(hipUnpack2(rgb0.data[3]), hipUnpack1(rgb0.data[4]), hipUnpack0(rgb0.data[5]), hipUnpack3(rgb0.data[5])));
blue1.x = hipPack(make_float4(hipUnpack2(rgb1.data[0]), hipUnpack1(rgb1.data[1]), hipUnpack0(rgb1.data[2]), hipUnpack3(rgb1.data[2])));
blue1.y = hipPack(make_float4(hipUnpack2(rgb1.data[3]), hipUnpack1(rgb1.data[4]), hipUnpack0(rgb1.data[5]), hipUnpack3(rgb1.data[5])));
*((uint2 *)(&dst_image_r[rgb0_idx])) = red0;
*((uint2 *)(&dst_image_r[rgb1_idx])) = red1;
*((uint2 *)(&dst_image_g[rgb0_idx])) = green0;
*((uint2 *)(&dst_image_g[rgb1_idx])) = green1;
*((uint2 *)(&dst_image_b[rgb0_idx])) = blue0;
*((uint2 *)(&dst_image_b[rgb1_idx])) = blue1;
}
}
/**
* @brief Converts YUV440 image to RGB planar format.
*
* This function takes a YUV440 image and converts it to RGB planar format using the ColorConvertYUV444ToRGBPlanarKernel HIP kernel.
*
* @param stream The HIP stream to be used for the kernel launch.
* @param dst_width The width of the destination RGB image.
* @param dst_height The height of the destination RGB image.
* @param dst_image_r Pointer to the destination red channel image.
* @param dst_image_g Pointer to the destination green channel image.
* @param dst_image_b Pointer to the destination blue channel image.
* @param dst_image_stride_in_bytes The stride (in bytes) of the destination image.
* @param src_yuv_image Pointer to the source YUV image.
* @param src_yuv_image_stride_in_bytes The stride (in bytes) of the source YUV image.
* @param src_u_image_offset The offset (in bytes) to the U channel in the source YUV image.
*/
void ColorConvertYUV440ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height,
uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, const uint8_t *src_yuv_image,
uint32_t src_yuv_image_stride_in_bytes, uint32_t src_u_image_offset, uint32_t src_v_image_offset) {
int32_t local_threads_x = 16;
int32_t local_threads_y = 4;
int32_t global_threads_x = (dst_width + 7) >> 3;
int32_t global_threads_y = (dst_height + 1) >> 1;
uint32_t dst_width_comp = (dst_width + 7) / 8;
uint32_t dst_height_comp = (dst_height + 1) / 2;
uint32_t dst_image_stride_in_bytes_comp = dst_image_stride_in_bytes * 2;
uint32_t src_yuv_image_stride_in_bytes_comp = src_yuv_image_stride_in_bytes * 2;
ColorConvertYUV440ToRGBPlanarKernel<<<dim3(ceil(static_cast<float>(global_threads_x) / local_threads_x), ceil(static_cast<float>(global_threads_y) / local_threads_y)),
dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, dst_image_r, dst_image_g, dst_image_b,
dst_image_stride_in_bytes, dst_image_stride_in_bytes_comp, src_yuv_image, src_yuv_image + src_u_image_offset,
src_yuv_image + src_v_image_offset, src_yuv_image_stride_in_bytes,
dst_width_comp, dst_height_comp, src_yuv_image_stride_in_bytes_comp);
}
__global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_height,
uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, uint32_t dst_image_stride_in_bytes_comp,
const uint8_t *src_image, uint32_t src_image_stride_in_bytes, uint32_t src_image_stride_in_bytes_comp,
+55
View File
@@ -45,6 +45,24 @@ void ColorConvertYUV444ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t ds
uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, const uint8_t *src_yuv_image,
uint32_t src_yuv_image_stride_in_bytes, uint32_t src_u_image_offset);
/**
* @brief Converts YUV440 image to RGB image.
*
* This function takes a YUV440 image and converts it to an RGB image.
*
* @param stream The HIP stream to be used for the conversion.
* @param dst_width The width of the destination RGB image.
* @param dst_height The height of the destination RGB image.
* @param dst_image Pointer to the destination RGB image buffer.
* @param dst_image_stride_in_bytes The stride (in bytes) of the destination RGB image buffer.
* @param src_yuv_image Pointer to the source YUV440 image buffer.
* @param src_yuv_image_stride_in_bytes The stride (in bytes) of the source YUV440 image buffer.
* @param src_u_image_offset The offset (in bytes) of the U component in the source YUV440 image buffer.
*/
void ColorConvertYUV440ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_height,
uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, const uint8_t *src_yuv_image,
uint32_t src_yuv_image_stride_in_bytes, uint32_t src_u_image_offset, uint32_t src_v_image_offset);
/**
* @brief Converts an image in YUYV format to RGB format.
*
@@ -140,6 +158,43 @@ void ColorConvertYUV444ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint3
uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, const uint8_t *src_yuv_image,
uint32_t src_yuv_image_stride_in_bytes, uint32_t src_u_image_offset);
/**
* @brief Converts YUV440 image to RGB planar format.
*
* This function takes a YUV440 image and converts it to RGB planar format.
* The resulting RGB image is stored in separate R, G, and B planes.
*
* @param stream The HIP stream to be used for the kernel execution.
* @param dst_width The width of the destination RGB image.
* @param dst_height The height of the destination RGB image.
* @param dst_image_r Pointer to the destination R plane of the RGB image.
* @param dst_image_g Pointer to the destination G plane of the RGB image.
* @param dst_image_b Pointer to the destination B plane of the RGB image.
* @param dst_image_stride_in_bytes The stride (in bytes) of the destination RGB image.
* @param src_yuv_image Pointer to the source YUV440 image.
* @param src_yuv_image_stride_in_bytes The stride (in bytes) of the source YUV440 image.
* @param src_u_image_offset The offset (in bytes) of the U plane in the source YUV440 image.
*/
void ColorConvertYUV440ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height,
uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, const uint8_t *src_yuv_image,
uint32_t src_yuv_image_stride_in_bytes, uint32_t src_u_image_offset, uint32_t src_v_image_offset);
/**
* Converts a YUYV image to RGB planar format.
*
* This function takes a YUYV image and converts it to RGB planar format. The resulting RGB image
* is stored in separate planes for red, green, and blue channels.
*
* @param stream The HIP stream to use for the conversion.
* @param dst_width The width of the destination RGB image.
* @param dst_height The height of the destination RGB image.
* @param dst_image_r Pointer to the destination red channel plane.
* @param dst_image_g Pointer to the destination green channel plane.
* @param dst_image_b Pointer to the destination blue channel plane.
* @param dst_image_stride_in_bytes The stride (in bytes) between consecutive rows in the destination image planes.
* @param src_image Pointer to the source YUYV image.
* @param src_image_stride_in_bytes The stride (in bytes) between consecutive rows in the source image.
*/
void ColorConvertYUYVToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height,
uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes,
const uint8_t *src_image, uint32_t src_image_stride_in_bytes);
+4
View File
@@ -541,6 +541,10 @@ RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg
surface_format = VA_RT_FORMAT_YUV444;
surface_attrib.value.value.i = VA_FOURCC_444P;
break;
case CSS_440:
surface_format = VA_RT_FORMAT_YUV422;
surface_attrib.value.value.i = VA_FOURCC_422V;
break;
case CSS_422:
surface_format = VA_RT_FORMAT_YUV422;
surface_attrib.value.value.i = ROCJPEG_FOURCC_YUYV;