From bd5af8b66cfb361d29cf6b15975db598553d326b Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Tue, 16 Apr 2024 18:27:22 -0400 Subject: [PATCH] Add support for ROCJPEG_OUTPUT_RGB_PLANAR output format (#15) * Add support for ROCJPEG_OUTPUT_RGB_PLANAR output format * update the jpegdecode sample based on the review comments * use make_float3 when it's possible [ROCm/rocjpeg commit: c591c342aa6e51340d90dd95b90b121f2040df12] --- projects/rocjpeg/api/rocjpeg.h | 6 +- .../rocjpeg/docs/how-to/using-rocjpeg.rst | 5 +- projects/rocjpeg/samples/CMakeLists.txt | 20 +- .../rocjpeg/samples/jpegDecode/jpegdecode.cpp | 54 +- projects/rocjpeg/src/rocjpeg_decoder.cpp | 101 +- projects/rocjpeg/src/rocjpeg_decoder.h | 4 +- projects/rocjpeg/src/rocjpeg_hip_kernels.cpp | 923 +++++++++++++++--- projects/rocjpeg/src/rocjpeg_hip_kernels.h | 17 + .../rocjpeg/src/rocjpeg_vaapi_decoder.cpp | 11 +- projects/rocjpeg/test/CMakeLists.txt | 20 +- 10 files changed, 937 insertions(+), 224 deletions(-) diff --git a/projects/rocjpeg/api/rocjpeg.h b/projects/rocjpeg/api/rocjpeg.h index a7de25845e..41bee611b8 100644 --- a/projects/rocjpeg/api/rocjpeg.h +++ b/projects/rocjpeg/api/rocjpeg.h @@ -109,10 +109,12 @@ typedef enum { ROCJPEG_OUTPUT_YUV_PLANAR = 1, // return luma component (Y) and write to first channel of RocJpegImage ROCJPEG_OUTPUT_Y = 2, - // convert to interleaved RGB using HIP kernels and write to first channel of RocJpegImage + // convert to interleaved RGB using VCN JPEG decoder (on MI300+) or using HIP kernels and write to first channel of RocJpegImage ROCJPEG_OUTPUT_RGB = 3, + // convert to RGB PLANAR using VCN JPEG decoder (on MI300+) or HIP kernels and write to first, second, and thrid channel of RocJpegImage. + ROCJPEG_OUTPUT_RGB_PLANAR = 4, // maximum allowed value - ROCJPEG_OUTPUT_FORMAT_MAX = 4 + ROCJPEG_OUTPUT_FORMAT_MAX = 5 } RocJpegOutputFormat; /*****************************************************/ diff --git a/projects/rocjpeg/docs/how-to/using-rocjpeg.rst b/projects/rocjpeg/docs/how-to/using-rocjpeg.rst index 982db65f95..328cc0c113 100644 --- a/projects/rocjpeg/docs/how-to/using-rocjpeg.rst +++ b/projects/rocjpeg/docs/how-to/using-rocjpeg.rst @@ -113,6 +113,7 @@ You can set the ``RocJpegOutputFormat`` parameter to one of the ``output_format` "ROCJPEG_OUTPUT_YUV_PLANAR", "Return in the YUV planar format." "ROCJPEG_OUTPUT_Y", "Return the Y component only." "ROCJPEG_OUTPUT_RGB", "Convert to interleaved RGB." + "ROCJPEG_OUTPUT_RGB_PLANAR", "Convert to planar RGB." For example, if ``output_format`` is set to ``ROCJPEG_OUTPUT_NATIVE``, then based on the chroma subsampling of the input image, the ``rocJpegDecode()`` function does one of the following: @@ -123,7 +124,7 @@ For example, if ``output_format`` is set to ``ROCJPEG_OUTPUT_NATIVE``, then base * For ``ROCJPEG_CSS_400`` write Y to first channel of ``RocJpegImage``. if ``output_format`` is set to ``ROCJPEG_OUTPUT_Y`` or ``ROCJPEG_OUTPUT_RGB`` then ``rocJpegDecode()`` copies the output to first channel of ``RocJpegImage``. -Alternately, in the case of ``ROCJPEG_OUTPUT_YUV_PLANAR``, the data is written to the corresponding channels of the ``RocJpegImage`` destination structure. +Alternately, in the case of ``ROCJPEG_OUTPUT_YUV_PLANAR`` or ``ROCJPEG_OUTPUT_RGB_PLANAR``, the data is written to the corresponding channels of the ``RocJpegImage`` destination structure. The destination buffers should be large enough to be able to store output of specified format. These buffers should be pre-allocated by the user in the device memories. For each color plane (channel), sizes could be retrieved for image using ``rocJpegGetImageInfo()`` API and minimum required memory buffer for each plane is plane_height * plane_pitch where @@ -143,7 +144,7 @@ the required size for the output buffers for a single decode JPEG. To optimally "ROCJPEG_OUTPUT_YUV_PLANAR", "ROCJPEG_CSS_400", "destination.pitch[0] = widths[0]", "destination.channel[0] = destination.pitch[0] * heights[0]" "ROCJPEG_OUTPUT_Y", "Any of the supported chroma subsampling", "destination.pitch[0] = widths[0]", "destination.channel[0] = destination.pitch[0] * heights[0]" "ROCJPEG_OUTPUT_RGB", "Any of the supported chroma subsampling", "destination.pitch[0] = widths[0] * 3", "destination.channel[0] = destination.pitch[0] * heights[0]" - + "ROCJPEG_OUTPUT_RGB_PLANAR", "Any of the supported chroma subsampling", "destination.pitch[c] = widths[c] for c = 0, 1, 2", "destination.channel[c] = destination.pitch[c] * heights[c] for c = 0, 1, 2" 5. Destroy the decoder ==================================================== diff --git a/projects/rocjpeg/samples/CMakeLists.txt b/projects/rocjpeg/samples/CMakeLists.txt index fd95ff1aa0..33769b11bc 100644 --- a/projects/rocjpeg/samples/CMakeLists.txt +++ b/projects/rocjpeg/samples/CMakeLists.txt @@ -24,7 +24,7 @@ cmake_minimum_required(VERSION 3.5) add_test( NAME - jpeg-decode-fmt-unchanged + jpeg-decode-fmt-native COMMAND "${CMAKE_CTEST_COMMAND}" --build-and-test "${CMAKE_CURRENT_SOURCE_DIR}/jpegDecode" @@ -36,7 +36,7 @@ add_test( add_test( NAME - jpeg-decode-fmt-yuv + jpeg-decode-fmt-yuv-planar COMMAND "${CMAKE_CTEST_COMMAND}" --build-and-test "${CMAKE_CURRENT_SOURCE_DIR}/jpegDecode" @@ -60,12 +60,24 @@ add_test( add_test( NAME - jpeg-decode-fmt-rgbi + jpeg-decode-fmt-rgb COMMAND "${CMAKE_CTEST_COMMAND}" --build-and-test "${CMAKE_CURRENT_SOURCE_DIR}/jpegDecode" "${CMAKE_CURRENT_BINARY_DIR}/jpegDecode" --build-generator "${CMAKE_GENERATOR}" --test-command "jpegdecode" - -i ${CMAKE_SOURCE_DIR}/data/images/ -fmt rgbi + -i ${CMAKE_SOURCE_DIR}/data/images/ -fmt rgb +) + +add_test( + NAME + jpeg-decode-fmt-rgb-planar + COMMAND + "${CMAKE_CTEST_COMMAND}" + --build-and-test "${CMAKE_CURRENT_SOURCE_DIR}/jpegDecode" + "${CMAKE_CURRENT_BINARY_DIR}/jpegDecode" + --build-generator "${CMAKE_GENERATOR}" + --test-command "jpegdecode" + -i ${CMAKE_SOURCE_DIR}/data/images/ -fmt rgb_planar ) \ No newline at end of file diff --git a/projects/rocjpeg/samples/jpegDecode/jpegdecode.cpp b/projects/rocjpeg/samples/jpegDecode/jpegdecode.cpp index 64836b1d91..236ff684b9 100644 --- a/projects/rocjpeg/samples/jpegDecode/jpegdecode.cpp +++ b/projects/rocjpeg/samples/jpegDecode/jpegdecode.cpp @@ -51,7 +51,7 @@ void ShowHelpAndExit(const char *option = NULL) { std::cout << "Options:" << std::endl << "-i Path to single image or directory of images - required" << std::endl << "-be Select rocJPEG backend (0 for ROCJPEG_BACKEND_HARDWARE, using VCN hardware-accelarated JPEG decoder, 1 ROCJPEG_BACKEND_HYBRID, using CPU and GPU HIP kernles for JPEG decoding); optional; default: 0" << std::endl - << "-fmt Select rocJPEG output format for decoding, one of the [native, yuv, y, rgb]; optional; default: native" << std::endl + << "-fmt Select rocJPEG output format for decoding, one of the [native, yuv, y, rgb, rgb_planar]; optional; default: native" << std::endl << "-o Output file path or directory - Write decoded images based on the selected outfut format to this file or directory; optional;" << std::endl << "-d GPU device id (0 for the first GPU device, 1 for the second GPU device, etc.); optional; default: 0" << std::endl; exit(0); @@ -107,6 +107,8 @@ void ParseCommandLine(std::string &input_path, std::string &output_file_path, in output_format = ROCJPEG_OUTPUT_Y; } else if (selected_output_format == "rgb") { output_format = ROCJPEG_OUTPUT_RGB; + } else if (selected_output_format == "rgb_planar") { + output_format = ROCJPEG_OUTPUT_RGB_PLANAR; } else { ShowHelpAndExit(argv[i]); } @@ -188,6 +190,10 @@ void SaveImage(std::string output_file_name, RocJpegImage *output_image, uint32_ widths[0] = img_width * 3; heights[0] = img_height; break; + case ROCJPEG_OUTPUT_RGB_PLANAR: + widths[2] = widths[1] = widths[0] = img_width; + heights[2] = heights[1] = heights[0] = img_height; + break; default: std::cout << "Unknown output format!" << std::endl; return; @@ -297,10 +303,10 @@ int main(int argc, char **argv) { uint32_t heights[ROCJPEG_MAX_COMPONENT] = {}; uint32_t channel_sizes[ROCJPEG_MAX_COMPONENT] = {}; uint32_t num_channels = 0; - int total_images_all = 0; + int total_images = 0; double time_per_image_all = 0; - double m_pixels_all = 0; - double image_per_sec_all = 0; + double mpixels_all = 0; + double images_per_sec = 0; std::string chroma_sub_sampling = ""; std::string input_path, output_file_path; std::vector file_paths = {}; @@ -444,6 +450,11 @@ int main(int argc, char **argv) { output_image.pitch[0] = widths[0] * 3; channel_sizes[0] = output_image.pitch[0] * heights[0]; break; + case ROCJPEG_OUTPUT_RGB_PLANAR: + num_channels = 3; + output_image.pitch[2] = output_image.pitch[1] = output_image.pitch[0] = widths[0]; + channel_sizes[2] = channel_sizes[1] = channel_sizes[0] = output_image.pitch[0] * heights[0]; + break; default: std::cout << "Unknown output format!" << std::endl; return EXIT_FAILURE; @@ -457,10 +468,8 @@ int main(int argc, char **argv) { auto start_time = std::chrono::high_resolution_clock::now(); CHECK_ROCJPEG(rocJpegDecode(rocjpeg_handle, reinterpret_cast(file_data[counter].data()), file_size, output_format, &output_image)); auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration decoder_time = end_time - start_time; - double time_per_image = decoder_time.count() * 1000; - double ips = (1 / time_per_image) * 1000; - double mpixels = ((double)widths[0] * (double)heights[0] / 1000000) * ips; + double time_per_image_in_milli_sec = std::chrono::duration(end_time - start_time).count(); + double image_size_in_mpixels = (static_cast(widths[0]) * static_cast(heights[0]) / 1000000); image_count++; if (dump_output_frames) { @@ -480,6 +489,9 @@ int main(int argc, char **argv) { case ROCJPEG_OUTPUT_RGB: file_extension = "rgb"; break; + case ROCJPEG_OUTPUT_RGB_PLANAR: + file_extension = "rgb_planar"; + break; default: file_extension = ""; break; @@ -499,27 +511,27 @@ int main(int argc, char **argv) { } } - std::cout << "info: total decoded images: " << image_count << std::endl; - std::cout << "info: average processing time per image (ms): " << time_per_image << std::endl; - std::cout << "info: average images per sec: " << (1 / time_per_image) * 1000 << std::endl; - std::cout << "info: total elapsed time (s): " << decoder_time.count() << std::endl; + std::cout << "info: average processing time per image (ms): " << time_per_image_in_milli_sec << std::endl; + std::cout << "info: average images per sec: " << 1000 / time_per_image_in_milli_sec << std::endl; if (is_dir) { std::cout << std::endl; - total_images_all += image_count; - time_per_image_all += time_per_image; - image_per_sec_all += ips; - m_pixels_all += mpixels; + total_images += image_count; + time_per_image_all += time_per_image_in_milli_sec; + mpixels_all += image_size_in_mpixels; } counter++; } if (is_dir) { - std::cout << "info: total decoded images: " << total_images_all << std::endl; - if (total_images_all) { - std::cout << "info: average processing time per image (ms): " << time_per_image_all / total_images_all << std::endl; - std::cout << "info: average decoded images per sec: " << image_per_sec_all / total_images_all << std::endl; - std::cout << "info: average decoded mpixels per sec: " << m_pixels_all / total_images_all << std::endl; + time_per_image_all = time_per_image_all / total_images; + images_per_sec = 1000 / time_per_image_all; + double mpixels_per_sec = mpixels_all * images_per_sec / total_images; + std::cout << "info: total decoded images: " << total_images << std::endl; + if (total_images) { + std::cout << "info: average processing time per image (ms): " << time_per_image_all << std::endl; + std::cout << "info: average decoded images per sec: " << images_per_sec << std::endl; + std::cout << "info: average decoded image_size_in_mpixels per sec: " << mpixels_per_sec << std::endl; } std::cout << std::endl; } diff --git a/projects/rocjpeg/src/rocjpeg_decoder.cpp b/projects/rocjpeg/src/rocjpeg_decoder.cpp index 4f1fc41e75..629021d8d5 100644 --- a/projects/rocjpeg/src/rocjpeg_decoder.cpp +++ b/projects/rocjpeg/src/rocjpeg_decoder.cpp @@ -90,10 +90,18 @@ RocJpegStatus ROCJpegDecoder::Decode(const uint8_t *data, size_t length, RocJpeg switch (output_format) { case ROCJPEG_OUTPUT_NATIVE: - // copy the native decoded output buffers from interop memory directly to the destination buffers + // Copy the native decoded output buffers from interop memory directly to the destination buffers CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height)); - CHECK_ROCJPEG(CopyLuma(destination, jpeg_stream_params->picture_parameter_buffer.picture_height)); - CHECK_ROCJPEG(CopyChroma(destination, chroma_height)); + // Copy Luma (first channel) for any surface format + CHECK_ROCJPEG(CopyChannel(destination, jpeg_stream_params->picture_parameter_buffer.picture_height, 0)); + if (hip_interop_.surface_format == VA_FOURCC_NV12) { + // Copy the second channel (UV interleaved) for NV12 + CHECK_ROCJPEG(CopyChannel(destination, chroma_height, 1)); + } else if (hip_interop_.surface_format == VA_FOURCC_444P) { + // Copy the second and third channels for YUV444 + CHECK_ROCJPEG(CopyChannel(destination, chroma_height, 1)); + CHECK_ROCJPEG(CopyChannel(destination, chroma_height, 2)); + } break; case ROCJPEG_OUTPUT_YUV_PLANAR: CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height)); @@ -108,6 +116,10 @@ RocJpegStatus ROCJpegDecoder::Decode(const uint8_t *data, size_t length, RocJpeg CHECK_ROCJPEG(ColorConvertToRGB(jpeg_stream_params->picture_parameter_buffer.picture_width, jpeg_stream_params->picture_parameter_buffer.picture_height, destination)); break; + case ROCJPEG_OUTPUT_RGB_PLANAR: + CHECK_ROCJPEG(ColorConvertToRGBPlanar(jpeg_stream_params->picture_parameter_buffer.picture_width, + jpeg_stream_params->picture_parameter_buffer.picture_height, destination)); + break; default: break; } @@ -218,40 +230,14 @@ RocJpegStatus ROCJpegDecoder::ReleaseHipInteropMem(VASurfaceID current_surface_i return ROCJPEG_STATUS_SUCCESS; } -RocJpegStatus ROCJpegDecoder::CopyLuma(RocJpegImage *destination, uint16_t picture_height) { - if (hip_interop_.pitch[0] != 0 && destination->pitch[0] != 0 && destination->channel[0] != nullptr) { - if (destination->pitch[0] == hip_interop_.pitch[0]) { - uint32_t luma_size = destination->pitch[0] * picture_height; - CHECK_HIP(hipMemcpyDtoDAsync(destination->channel[0], hip_interop_.hip_mapped_device_mem, luma_size, hip_stream_)); +RocJpegStatus ROCJpegDecoder::CopyChannel(RocJpegImage *destination, uint16_t channel_height, uint8_t channel_index) { + if (hip_interop_.pitch[channel_index] != 0 && destination->pitch[channel_index] != 0 && destination->channel[channel_index] != nullptr) { + if (destination->pitch[channel_index] == hip_interop_.pitch[channel_index]) { + uint32_t channel_size = destination->pitch[channel_index] * channel_height; + CHECK_HIP(hipMemcpyDtoDAsync(destination->channel[channel_index], hip_interop_.hip_mapped_device_mem + hip_interop_.offset[channel_index], channel_size, hip_stream_)); } else { - CHECK_HIP(hipMemcpy2DAsync(destination->channel[0], destination->pitch[0], hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0], - destination->pitch[0], picture_height, hipMemcpyDeviceToDevice, hip_stream_)); - } - } - return ROCJPEG_STATUS_SUCCESS; -} - -RocJpegStatus ROCJpegDecoder::CopyChroma(RocJpegImage *destination, uint16_t chroma_height) { - // copy channel1 - if (hip_interop_.pitch[1] != 0 && destination->pitch[1] != 0 && destination->channel[1] != nullptr) { - uint32_t chroma_size = destination->pitch[1] * chroma_height; - uint8_t *layer1_mem = hip_interop_.hip_mapped_device_mem + hip_interop_.offset[1]; - if (destination->pitch[1] == hip_interop_.pitch[1]) { - CHECK_HIP(hipMemcpyDtoDAsync(destination->channel[1], layer1_mem, chroma_size, hip_stream_)); - } else { - CHECK_HIP(hipMemcpy2DAsync(destination->channel[1], destination->pitch[1], layer1_mem, hip_interop_.pitch[1], - destination->pitch[1], chroma_height, hipMemcpyDeviceToDevice, hip_stream_)); - } - } - // copy channel2 - if (hip_interop_.pitch[2] != 0 && destination->pitch[2] != 0 && destination->channel[2] != nullptr) { - uint32_t chroma_size = destination->pitch[2] * chroma_height; - uint8_t *layer2_mem = hip_interop_.hip_mapped_device_mem + hip_interop_.offset[2]; - if (destination->pitch[2] == hip_interop_.pitch[2]) { - CHECK_HIP(hipMemcpyDtoDAsync(destination->channel[2], layer2_mem, chroma_size, hip_stream_)); - } else { - CHECK_HIP(hipMemcpy2DAsync(destination->channel[2], destination->pitch[2], layer2_mem, hip_interop_.pitch[2], - destination->pitch[2], chroma_height, hipMemcpyDeviceToDevice, hip_stream_)); + CHECK_HIP(hipMemcpy2DAsync(destination->channel[channel_index], destination->pitch[channel_index], hip_interop_.hip_mapped_device_mem + hip_interop_.offset[channel_index], hip_interop_.pitch[channel_index], + destination->pitch[channel_index], channel_height, hipMemcpyDeviceToDevice, hip_stream_)); } } return ROCJPEG_STATUS_SUCCESS; @@ -307,19 +293,53 @@ RocJpegStatus ROCJpegDecoder::ColorConvertToRGB(uint32_t picture_width, uint32_t return ROCJPEG_STATUS_SUCCESS; } +RocJpegStatus ROCJpegDecoder::ColorConvertToRGBPlanar(uint32_t picture_width, uint32_t picture_height, RocJpegImage *destination) { + switch (hip_interop_.surface_format) { + case VA_FOURCC_444P: + ColorConvertYUV444ToRGBPlanar(hip_stream_, picture_width, picture_height, destination->channel[0], destination->channel[1], destination->channel[2], destination->pitch[0], + hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0], hip_interop_.offset[1]); + break; + case ROCJPEG_FOURCC_YUYV: + ColorConvertYUYVToRGBPlanar(hip_stream_, picture_width, picture_height, destination->channel[0], destination->channel[1], destination->channel[2], destination->pitch[0], + hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]); + break; + case VA_FOURCC_NV12: + ColorConvertNV12ToRGBPlanar(hip_stream_, picture_width, picture_height, destination->channel[0], destination->channel[1], destination->channel[2], destination->pitch[0], + hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0], + hip_interop_.hip_mapped_device_mem + hip_interop_.offset[1], hip_interop_.pitch[1]); + break; + case VA_FOURCC_Y800: + ColorConvertYUV400ToRGBPlanar(hip_stream_, picture_width, picture_height, destination->channel[0], destination->channel[1], destination->channel[2], destination->pitch[0], + hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]); + break; + case VA_FOURCC_RGBP: + // Copy red, green, and blue channels from the interop memory into the destination + for (uint8_t channel_index = 0; channel_index < 3; channel_index++) { + CHECK_ROCJPEG(CopyChannel(destination, picture_height, channel_index)); + } + break; + default: + ERR("ERROR! surface format is not supported!"); + return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED; + } + return ROCJPEG_STATUS_SUCCESS; +} + RocJpegStatus ROCJpegDecoder::GetPlanarYUVOutputFormat(uint32_t picture_width, uint32_t picture_height, uint16_t chroma_height, RocJpegImage *destination) { if (hip_interop_.surface_format == ROCJPEG_FOURCC_YUYV) { // Extract the packed YUYV and copy them into the first, second, and thrid channels of the destination. ConvertPackedYUYVToPlanarYUV(hip_stream_, picture_width, picture_height, destination->channel[0], destination->channel[1], destination->channel[2], destination->pitch[0], destination->pitch[1], hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]); } else { - CHECK_ROCJPEG(CopyLuma(destination, picture_height)); + // Copy Luma + CHECK_ROCJPEG(CopyChannel(destination, picture_height, 0)); if (hip_interop_.surface_format == VA_FOURCC_NV12) { // Extract the interleaved UV channels and copy them into the second and thrid channels of the destination. ConvertInterleavedUVToPlanarUV(hip_stream_, picture_width >> 1, picture_height >> 1, destination->channel[1], destination->channel[2], destination->pitch[1], hip_interop_.hip_mapped_device_mem + hip_interop_.offset[1] , hip_interop_.pitch[1]); - } else { - CHECK_ROCJPEG(CopyChroma(destination, chroma_height)); + } else if (hip_interop_.surface_format == VA_FOURCC_444P) { + CHECK_ROCJPEG(CopyChannel(destination, chroma_height, 1)); + CHECK_ROCJPEG(CopyChannel(destination, chroma_height, 2)); } } return ROCJPEG_STATUS_SUCCESS; @@ -330,7 +350,8 @@ RocJpegStatus ROCJpegDecoder::GetYOutputFormat(uint32_t picture_width, uint32_t ExtractYFromPackedYUYV(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0], hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]); } else { - CHECK_ROCJPEG(CopyLuma(destination, picture_height)); + // Copy Luma + CHECK_ROCJPEG(CopyChannel(destination, picture_height, 0)); } return ROCJPEG_STATUS_SUCCESS; } \ No newline at end of file diff --git a/projects/rocjpeg/src/rocjpeg_decoder.h b/projects/rocjpeg/src/rocjpeg_decoder.h index 5fdeddb108..ba6e751219 100644 --- a/projects/rocjpeg/src/rocjpeg_decoder.h +++ b/projects/rocjpeg/src/rocjpeg_decoder.h @@ -57,9 +57,9 @@ class ROCJpegDecoder { RocJpegStatus GetHipInteropMem(VADRMPRIMESurfaceDescriptor &va_drm_prime_surface_desc); RocJpegStatus ReleaseHipInteropMem(VASurfaceID current_surface_id); RocJpegStatus GetChromaHeight(uint16_t picture_height, uint16_t &chroma_height); - RocJpegStatus CopyLuma(RocJpegImage *destination, uint16_t picture_height); - RocJpegStatus CopyChroma(RocJpegImage *destination, uint16_t chroma_height); + RocJpegStatus CopyChannel(RocJpegImage *destination, uint16_t channel_height, uint8_t channel_index); RocJpegStatus ColorConvertToRGB(uint32_t picture_width, uint32_t picture_height, RocJpegImage *destination); + RocJpegStatus ColorConvertToRGBPlanar(uint32_t picture_width, uint32_t picture_height, RocJpegImage *destination); RocJpegStatus GetPlanarYUVOutputFormat(uint32_t picture_width, uint32_t picture_height, uint16_t chroma_height, RocJpegImage *destination); RocJpegStatus GetYOutputFormat(uint32_t picture_width, uint32_t picture_height, RocJpegImage *destination); int num_devices_; diff --git a/projects/rocjpeg/src/rocjpeg_hip_kernels.cpp b/projects/rocjpeg/src/rocjpeg_hip_kernels.cpp index 2d9c1057fc..9b305b66c2 100644 --- a/projects/rocjpeg/src/rocjpeg_hip_kernels.cpp +++ b/projects/rocjpeg/src/rocjpeg_hip_kernels.cpp @@ -80,18 +80,14 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h DUINT6 rgb0, rgb1; float4 f; - yuv.x = hipUnpack0(y0.x); - yuv.y = hipUnpack0(u0.x); - yuv.z = hipUnpack0(v0.x); + yuv = make_float3(hipUnpack0(y0.x), hipUnpack0(u0.x), hipUnpack0(v0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(y0.x); - yuv.y = hipUnpack1(u0.x); - yuv.z = hipUnpack1(v0.x); + yuv = make_float3(hipUnpack1(y0.x), hipUnpack1(u0.x), hipUnpack1(v0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -100,9 +96,7 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(y0.x); - yuv.y = hipUnpack2(u0.x); - yuv.z = hipUnpack2(v0.x); + yuv = make_float3(hipUnpack2(y0.x), hipUnpack2(u0.x), hipUnpack2(v0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -111,9 +105,7 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h rgb0.data[1] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(y0.x); - yuv.y = hipUnpack3(u0.x); - yuv.z = hipUnpack3(v0.x); + yuv = make_float3(hipUnpack3(y0.x), hipUnpack3(u0.x), hipUnpack3(v0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -122,18 +114,14 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h f.w = fmaf(cb.x, yuv.y, yuv.x); rgb0.data[2] = hipPack(f); - yuv.x = hipUnpack0(y0.y); - yuv.y = hipUnpack0(u0.y); - yuv.z = hipUnpack0(v0.y); + yuv = make_float3(hipUnpack0(y0.y), hipUnpack0(u0.y), hipUnpack0(v0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(y0.y); - yuv.y = hipUnpack1(u0.y); - yuv.z = hipUnpack1(v0.y); + yuv = make_float3(hipUnpack1(y0.y), hipUnpack1(u0.y), hipUnpack1(v0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -142,9 +130,7 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(y0.y); - yuv.y = hipUnpack2(u0.y); - yuv.z = hipUnpack2(v0.y); + yuv = make_float3(hipUnpack2(y0.y), hipUnpack2(u0.y), hipUnpack2(v0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -153,9 +139,7 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h rgb0.data[4] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(y0.y); - yuv.y = hipUnpack3(u0.y); - yuv.z = hipUnpack3(v0.y); + yuv = make_float3(hipUnpack3(y0.y), hipUnpack3(u0.y), hipUnpack3(v0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -164,18 +148,14 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h f.w = fmaf(cb.x, yuv.y, yuv.x); rgb0.data[5] = hipPack(f); - yuv.x = hipUnpack0(y1.x); - yuv.y = hipUnpack0(u1.x); - yuv.z = hipUnpack0(v1.x); + yuv = make_float3(hipUnpack0(y1.x), hipUnpack0(u1.x), hipUnpack0(v1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(y1.x); - yuv.y = hipUnpack1(u1.x); - yuv.z = hipUnpack1(v1.x); + yuv = make_float3(hipUnpack1(y1.x), hipUnpack1(u1.x), hipUnpack1(v1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -184,9 +164,7 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(y1.x); - yuv.y = hipUnpack2(u1.x); - yuv.z = hipUnpack2(v1.x); + yuv = make_float3(hipUnpack2(y1.x), hipUnpack2(u1.x), hipUnpack2(v1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -195,9 +173,7 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h rgb1.data[1] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(y1.x); - yuv.y = hipUnpack3(u1.x); - yuv.z = hipUnpack3(v1.x); + yuv = make_float3(hipUnpack3(y1.x), hipUnpack3(u1.x), hipUnpack3(v1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -206,18 +182,14 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h f.w = fmaf(cb.x, yuv.y, yuv.x); rgb1.data[2] = hipPack(f); - yuv.x = hipUnpack0(y1.y); - yuv.y = hipUnpack0(u1.y); - yuv.z = hipUnpack0(v1.y); + yuv = make_float3(hipUnpack0(y1.y), hipUnpack0(u1.y), hipUnpack0(v1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(y1.y); - yuv.y = hipUnpack1(u1.y); - yuv.z = hipUnpack1(v1.y); + yuv = make_float3(hipUnpack1(y1.y), hipUnpack1(u1.y), hipUnpack1(v1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -226,9 +198,7 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(y1.y); - yuv.y = hipUnpack2(u1.y); - yuv.z = hipUnpack2(v1.y); + yuv = make_float3(hipUnpack2(y1.y), hipUnpack2(u1.y), hipUnpack2(v1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -237,9 +207,7 @@ __global__ void ColorConvertYUV444ToRGBKernel(uint32_t dst_width, uint32_t dst_h rgb1.data[4] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(y1.y); - yuv.y = hipUnpack3(u1.y); - yuv.z = hipUnpack3(v1.y); + yuv = make_float3(hipUnpack3(y1.y), hipUnpack3(u1.y), hipUnpack3(v1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -268,7 +236,221 @@ void ColorConvertYUV444ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t ds uint32_t src_yuv_image_stride_in_bytes_comp = src_yuv_image_stride_in_bytes * 2; ColorConvertYUV444ToRGBKernel<<(global_threads_x) / local_threads_x), ceil(static_cast(global_threads_y) / local_threads_y)), - dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, (uint8_t *)dst_image, + dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, dst_image, + dst_image_stride_in_bytes, dst_image_stride_in_bytes_comp, src_yuv_image, src_yuv_image + src_u_image_offset, + src_yuv_image + (src_u_image_offset * 2), src_yuv_image_stride_in_bytes, + dst_width_comp, dst_height_comp, src_yuv_image_stride_in_bytes_comp); +} + +__global__ void ColorConvertYUV444ToRGBPlanarKernel(uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, + uint32_t dst_image_stride_in_bytes_comp, const uint8_t *src_y_image, const uint8_t *src_u_image, const uint8_t *src_v_image, + uint32_t src_yuv_image_stride_in_bytes, uint32_t dst_width_comp, uint32_t dst_height_comp, uint32_t src_yuv_image_stride_in_bytes_comp) { + + int32_t x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int32_t y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + if ((x < dst_width_comp) && (y < dst_height_comp)) { + uint32_t src_y0_idx = y * src_yuv_image_stride_in_bytes_comp + (x << 3); + uint32_t src_y1_idx = src_y0_idx + src_yuv_image_stride_in_bytes; + + + uint2 y0 = *((uint2 *)(&src_y_image[src_y0_idx])); + uint2 y1 = *((uint2 *)(&src_y_image[src_y1_idx])); + + uint2 u0 = *((uint2 *)(&src_u_image[src_y0_idx])); + uint2 u1 = *((uint2 *)(&src_u_image[src_y1_idx])); + + uint2 v0 = *((uint2 *)(&src_v_image[src_y0_idx])); + uint2 v1 = *((uint2 *)(&src_v_image[src_y1_idx])); + + uint32_t rgb0_idx = y * dst_image_stride_in_bytes_comp + (x * 8); + uint32_t rgb1_idx = rgb0_idx + dst_image_stride_in_bytes; + + float2 cr = make_float2( 0.0000f, 1.5748f); + float2 cg = make_float2(-0.1873f, -0.4681f); + float2 cb = make_float2( 1.8556f, 0.0000f); + float3 yuv; + DUINT6 rgb0, rgb1; + float4 f; + + yuv = make_float3(hipUnpack0(y0.x), hipUnpack0(u0.x), hipUnpack0(v0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(y0.x), hipUnpack1(u0.x), hipUnpack1(v0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + rgb0.data[0] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(y0.x), hipUnpack2(u0.x), hipUnpack2(v0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + rgb0.data[1] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(y0.x), hipUnpack3(u0.x), hipUnpack3(v0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + rgb0.data[2] = hipPack(f); + + yuv = make_float3(hipUnpack0(y0.y), hipUnpack0(u0.y), hipUnpack0(v0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(y0.y), hipUnpack1(u0.y), hipUnpack1(v0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + rgb0.data[3] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(y0.y), hipUnpack2(u0.y), hipUnpack2(v0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + rgb0.data[4] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(y0.y), hipUnpack3(u0.y), hipUnpack3(v0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + rgb0.data[5] = hipPack(f); + + yuv = make_float3(hipUnpack0(y1.x), hipUnpack0(u1.x), hipUnpack0(v1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(y1.x), hipUnpack1(u1.x), hipUnpack1(v1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + rgb1.data[0] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(y1.x), hipUnpack2(u1.x), hipUnpack2(v1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + rgb1.data[1] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(y1.x), hipUnpack3(u1.x), hipUnpack3(v1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + rgb1.data[2] = hipPack(f); + + yuv = make_float3(hipUnpack0(y1.y), hipUnpack0(u1.y), hipUnpack0(v1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(y1.y), hipUnpack1(u1.y), hipUnpack1(v1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + rgb1.data[3] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(y1.y), hipUnpack2(u1.y), hipUnpack2(v1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + rgb1.data[4] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(y1.y), hipUnpack3(u1.y), hipUnpack3(v1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + rgb1.data[5] = hipPack(f); + + uint2 red0, red1, green0, green1, blue0, blue1; + red0.x = hipPack(make_float4(hipUnpack0(rgb0.data[0]), hipUnpack3(rgb0.data[0]), hipUnpack2(rgb0.data[1]), hipUnpack1(rgb0.data[2]))); + red0.y = hipPack(make_float4(hipUnpack0(rgb0.data[3]), hipUnpack3(rgb0.data[3]), hipUnpack2(rgb0.data[4]), hipUnpack1(rgb0.data[5]))); + red1.x = hipPack(make_float4(hipUnpack0(rgb1.data[0]), hipUnpack3(rgb1.data[0]), hipUnpack2(rgb1.data[1]), hipUnpack1(rgb1.data[2]))); + red1.y = hipPack(make_float4(hipUnpack0(rgb1.data[3]), hipUnpack3(rgb1.data[3]), hipUnpack2(rgb1.data[4]), hipUnpack1(rgb1.data[5]))); + + green0.x = hipPack(make_float4(hipUnpack1(rgb0.data[0]), hipUnpack0(rgb0.data[1]), hipUnpack3(rgb0.data[1]), hipUnpack2(rgb0.data[2]))); + green0.y = hipPack(make_float4(hipUnpack1(rgb0.data[3]), hipUnpack0(rgb0.data[4]), hipUnpack3(rgb0.data[4]), hipUnpack2(rgb0.data[5]))); + green1.x = hipPack(make_float4(hipUnpack1(rgb1.data[0]), hipUnpack0(rgb1.data[1]), hipUnpack3(rgb1.data[1]), hipUnpack2(rgb1.data[2]))); + green1.y = hipPack(make_float4(hipUnpack1(rgb1.data[3]), hipUnpack0(rgb1.data[4]), hipUnpack3(rgb1.data[4]), hipUnpack2(rgb1.data[5]))); + + blue0.x = hipPack(make_float4(hipUnpack2(rgb0.data[0]), hipUnpack1(rgb0.data[1]), hipUnpack0(rgb0.data[2]), hipUnpack3(rgb0.data[2]))); + blue0.y = hipPack(make_float4(hipUnpack2(rgb0.data[3]), hipUnpack1(rgb0.data[4]), hipUnpack0(rgb0.data[5]), hipUnpack3(rgb0.data[5]))); + blue1.x = hipPack(make_float4(hipUnpack2(rgb1.data[0]), hipUnpack1(rgb1.data[1]), hipUnpack0(rgb1.data[2]), hipUnpack3(rgb1.data[2]))); + blue1.y = hipPack(make_float4(hipUnpack2(rgb1.data[3]), hipUnpack1(rgb1.data[4]), hipUnpack0(rgb1.data[5]), hipUnpack3(rgb1.data[5]))); + + *((uint2 *)(&dst_image_r[rgb0_idx])) = red0; + *((uint2 *)(&dst_image_r[rgb1_idx])) = red1; + *((uint2 *)(&dst_image_g[rgb0_idx])) = green0; + *((uint2 *)(&dst_image_g[rgb1_idx])) = green1; + *((uint2 *)(&dst_image_b[rgb0_idx])) = blue0; + *((uint2 *)(&dst_image_b[rgb1_idx])) = blue1; + } +} + + +void ColorConvertYUV444ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, const uint8_t *src_yuv_image, + uint32_t src_yuv_image_stride_in_bytes, uint32_t src_u_image_offset) { + + int32_t local_threads_x = 16; + int32_t local_threads_y = 4; + int32_t global_threads_x = (dst_width + 7) >> 3; + int32_t global_threads_y = (dst_height + 1) >> 1; + + uint32_t dst_width_comp = (dst_width + 7) / 8; + uint32_t dst_height_comp = (dst_height + 1) / 2; + uint32_t dst_image_stride_in_bytes_comp = dst_image_stride_in_bytes * 2; + uint32_t src_yuv_image_stride_in_bytes_comp = src_yuv_image_stride_in_bytes * 2; + + ColorConvertYUV444ToRGBPlanarKernel<<(global_threads_x) / local_threads_x), ceil(static_cast(global_threads_y) / local_threads_y)), + dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, dst_image_r, dst_image_g, dst_image_b, dst_image_stride_in_bytes, dst_image_stride_in_bytes_comp, src_yuv_image, src_yuv_image + src_u_image_offset, src_yuv_image + (src_u_image_offset * 2), src_yuv_image_stride_in_bytes, dst_width_comp, dst_height_comp, src_yuv_image_stride_in_bytes_comp); @@ -316,18 +498,14 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei float3 yuv; DUINT6 prgb0, prgb1; - yuv.x = hipUnpack0(py0.x); - yuv.y = hipUnpack0(pu0.x); - yuv.z = hipUnpack0(pv0.x); + yuv = make_float3(hipUnpack0(py0.x), hipUnpack0(pu0.x), hipUnpack0(pv0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(py0.x); - yuv.y = hipUnpack1(pu0.x); - yuv.z = hipUnpack1(pv0.x); + yuv = make_float3(hipUnpack1(py0.x), hipUnpack1(pu0.x), hipUnpack1(pv0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -336,9 +514,7 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(py0.x); - yuv.y = hipUnpack2(pu0.x); - yuv.z = hipUnpack2(pv0.x); + yuv = make_float3(hipUnpack2(py0.x), hipUnpack2(pu0.x), hipUnpack2(pv0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -347,9 +523,7 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei prgb0.data[1] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(py0.x); - yuv.y = hipUnpack3(pu0.x); - yuv.z = hipUnpack3(pv0.x); + yuv = make_float3(hipUnpack3(py0.x), hipUnpack3(pu0.x), hipUnpack3(pv0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -358,18 +532,14 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.w = fmaf(cb.x, yuv.y, yuv.x); prgb0.data[2] = hipPack(f); - yuv.x = hipUnpack0(py0.y); - yuv.y = hipUnpack0(pu0.y); - yuv.z = hipUnpack0(pv0.y); + yuv = make_float3(hipUnpack0(py0.y), hipUnpack0(pu0.y), hipUnpack0(pv0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(py0.y); - yuv.y = hipUnpack1(pu0.y); - yuv.z = hipUnpack1(pv0.y); + yuv = make_float3(hipUnpack1(py0.y), hipUnpack1(pu0.y), hipUnpack1(pv0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -378,9 +548,7 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(py0.y); - yuv.y = hipUnpack2(pu0.y); - yuv.z = hipUnpack2(pv0.y); + yuv = make_float3(hipUnpack2(py0.y), hipUnpack2(pu0.y), hipUnpack2(pv0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -389,9 +557,7 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei prgb0.data[4] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(py0.y); - yuv.y = hipUnpack3(pu0.y); - yuv.z = hipUnpack3(pv0.y); + yuv = make_float3(hipUnpack3(py0.y), hipUnpack3(pu0.y), hipUnpack3(pv0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -400,18 +566,14 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.w = fmaf(cb.x, yuv.y, yuv.x); prgb0.data[5] = hipPack(f); - yuv.x = hipUnpack0(py1.x); - yuv.y = hipUnpack0(pu1.x); - yuv.z = hipUnpack0(pv1.x); + yuv = make_float3(hipUnpack0(py1.x), hipUnpack0(pu1.x), hipUnpack0(pv1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(py1.x); - yuv.y = hipUnpack1(pu1.x); - yuv.z = hipUnpack1(pv1.x); + yuv = make_float3(hipUnpack1(py1.x), hipUnpack1(pu1.x), hipUnpack1(pv1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -420,9 +582,7 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(py1.x); - yuv.y = hipUnpack2(pu1.x); - yuv.z = hipUnpack2(pv1.x); + yuv = make_float3(hipUnpack2(py1.x), hipUnpack2(pu1.x), hipUnpack2(pv1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -431,9 +591,7 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei prgb1.data[1] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(py1.x); - yuv.y = hipUnpack3(pu1.x); - yuv.z = hipUnpack3(pv1.x); + yuv = make_float3(hipUnpack3(py1.x), hipUnpack3(pu1.x), hipUnpack3(pv1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -442,18 +600,14 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.w = fmaf(cb.x, yuv.y, yuv.x); prgb1.data[2] = hipPack(f); - yuv.x = hipUnpack0(py1.y); - yuv.y = hipUnpack0(pu1.y); - yuv.z = hipUnpack0(pv1.y); + yuv = make_float3(hipUnpack0(py1.y), hipUnpack0(pu1.y), hipUnpack0(pv1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(py1.y); - yuv.y = hipUnpack1(pu1.y); - yuv.z = hipUnpack1(pv1.y); + yuv = make_float3(hipUnpack1(py1.y), hipUnpack1(pu1.y), hipUnpack1(pv1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -462,9 +616,7 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(py1.y); - yuv.y = hipUnpack2(pu1.y); - yuv.z = hipUnpack2(pv1.y); + yuv = make_float3(hipUnpack2(py1.y), hipUnpack2(pu1.y), hipUnpack2(pv1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -473,9 +625,7 @@ __global__ void ColorConvertYUYVToRGBKernel(uint32_t dst_width, uint32_t dst_hei prgb1.data[4] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(py1.y); - yuv.y = hipUnpack3(pu1.y); - yuv.z = hipUnpack3(pv1.y); + yuv = make_float3(hipUnpack3(py1.y), hipUnpack3(pu1.y), hipUnpack3(pv1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -503,7 +653,229 @@ void ColorConvertYUYVToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_ uint32_t src_image_stride_in_bytes_comp = src_image_stride_in_bytes * 2; ColorConvertYUYVToRGBKernel<<(global_threads_x) / local_threads_x), ceil(static_cast(global_threads_y) / local_threads_y)), - dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, (uint8_t *)dst_image, + dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, dst_image, + dst_image_stride_in_bytes, dst_image_stride_in_bytes_comp, src_image, src_image_stride_in_bytes, + src_image_stride_in_bytes_comp, dst_width_comp, dst_height_comp); +} + +__global__ void ColorConvertYUYVToRGBPlanarKernel(uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, uint32_t dst_image_stride_in_bytes_comp, + const uint8_t *src_image, uint32_t src_image_stride_in_bytes, uint32_t src_image_stride_in_bytes_comp, + uint32_t dst_width_comp, uint32_t dst_height_comp) { + int32_t x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int32_t y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + if ((x < dst_width_comp) && (y < dst_height_comp)) { + uint32_t l0_idx = y * src_image_stride_in_bytes_comp + (x << 4); + uint32_t l1_idx = l0_idx + src_image_stride_in_bytes; + uint4 l0 = *((uint4 *)(&src_image[l0_idx])); + uint4 l1 = *((uint4 *)(&src_image[l1_idx])); + + uint32_t rgb0_idx = y * dst_image_stride_in_bytes_comp + (x * 8); + uint32_t rgb1_idx = rgb0_idx + dst_image_stride_in_bytes; + + float4 f; + + uint2 py0, py1; + uint2 pu0, pu1; + uint2 pv0, pv1; + + py0.x = hipPack(make_float4(hipUnpack0(l0.x), hipUnpack2(l0.x), hipUnpack0(l0.y), hipUnpack2(l0.y))); + py0.y = hipPack(make_float4(hipUnpack0(l0.z), hipUnpack2(l0.z), hipUnpack0(l0.w), hipUnpack2(l0.w))); + py1.x = hipPack(make_float4(hipUnpack0(l1.x), hipUnpack2(l1.x), hipUnpack0(l1.y), hipUnpack2(l1.y))); + py1.y = hipPack(make_float4(hipUnpack0(l1.z), hipUnpack2(l1.z), hipUnpack0(l1.w), hipUnpack2(l1.w))); + pu0.x = hipPack(make_float4(hipUnpack1(l0.x), hipUnpack1(l0.x), hipUnpack1(l0.y), hipUnpack1(l0.y))); + pu0.y = hipPack(make_float4(hipUnpack1(l0.z), hipUnpack1(l0.z), hipUnpack1(l0.w), hipUnpack1(l0.w))); + pu1.x = hipPack(make_float4(hipUnpack1(l1.x), hipUnpack1(l1.x), hipUnpack1(l1.y), hipUnpack1(l1.y))); + pu1.y = hipPack(make_float4(hipUnpack1(l1.z), hipUnpack1(l1.z), hipUnpack1(l1.w), hipUnpack1(l1.w))); + pv0.x = hipPack(make_float4(hipUnpack3(l0.x), hipUnpack3(l0.x), hipUnpack3(l0.y), hipUnpack3(l0.y))); + pv0.y = hipPack(make_float4(hipUnpack3(l0.z), hipUnpack3(l0.z), hipUnpack3(l0.w), hipUnpack3(l0.w))); + pv1.x = hipPack(make_float4(hipUnpack3(l1.x), hipUnpack3(l1.x), hipUnpack3(l1.y), hipUnpack3(l1.y))); + pv1.y = hipPack(make_float4(hipUnpack3(l1.z), hipUnpack3(l1.z), hipUnpack3(l1.w), hipUnpack3(l1.w))); + + float2 cr = make_float2( 0.0000f, 1.5748f); + float2 cg = make_float2(-0.1873f, -0.4681f); + float2 cb = make_float2( 1.8556f, 0.0000f); + float3 yuv; + DUINT6 prgb0, prgb1; + + yuv = make_float3(hipUnpack0(py0.x), hipUnpack0(pu0.x), hipUnpack0(pv0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(py0.x), hipUnpack1(pu0.x), hipUnpack1(pv0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + prgb0.data[0] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(py0.x), hipUnpack2(pu0.x), hipUnpack2(pv0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + prgb0.data[1] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(py0.x), hipUnpack3(pu0.x), hipUnpack3(pv0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + prgb0.data[2] = hipPack(f); + + yuv = make_float3(hipUnpack0(py0.y), hipUnpack0(pu0.y), hipUnpack0(pv0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(py0.y), hipUnpack1(pu0.y), hipUnpack1(pv0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + prgb0.data[3] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(py0.y), hipUnpack2(pu0.y), hipUnpack2(pv0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + prgb0.data[4] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(py0.y), hipUnpack3(pu0.y), hipUnpack3(pv0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + prgb0.data[5] = hipPack(f); + + yuv = make_float3(hipUnpack0(py1.x), hipUnpack0(pu1.x), hipUnpack0(pv1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(py1.x), hipUnpack1(pu1.x), hipUnpack1(pv1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + prgb1.data[0] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(py1.x), hipUnpack2(pu1.x), hipUnpack2(pv1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + prgb1.data[1] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(py1.x), hipUnpack3(pu1.x), hipUnpack3(pv1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + prgb1.data[2] = hipPack(f); + + yuv = make_float3(hipUnpack0(py1.y), hipUnpack0(pu1.y), hipUnpack0(pv1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(py1.y), hipUnpack1(pu1.y), hipUnpack1(pv1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + prgb1.data[3] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(py1.y), hipUnpack2(pu1.y), hipUnpack2(pv1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + prgb1.data[4] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(py1.y), hipUnpack3(pu1.y), hipUnpack3(pv1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + prgb1.data[5] = hipPack(f); + + uint2 red0, red1, green0, green1, blue0, blue1; + red0.x = hipPack(make_float4(hipUnpack0(prgb0.data[0]), hipUnpack3(prgb0.data[0]), hipUnpack2(prgb0.data[1]), hipUnpack1(prgb0.data[2]))); + red0.y = hipPack(make_float4(hipUnpack0(prgb0.data[3]), hipUnpack3(prgb0.data[3]), hipUnpack2(prgb0.data[4]), hipUnpack1(prgb0.data[5]))); + red1.x = hipPack(make_float4(hipUnpack0(prgb1.data[0]), hipUnpack3(prgb1.data[0]), hipUnpack2(prgb1.data[1]), hipUnpack1(prgb1.data[2]))); + red1.y = hipPack(make_float4(hipUnpack0(prgb1.data[3]), hipUnpack3(prgb1.data[3]), hipUnpack2(prgb1.data[4]), hipUnpack1(prgb1.data[5]))); + + green0.x = hipPack(make_float4(hipUnpack1(prgb0.data[0]), hipUnpack0(prgb0.data[1]), hipUnpack3(prgb0.data[1]), hipUnpack2(prgb0.data[2]))); + green0.y = hipPack(make_float4(hipUnpack1(prgb0.data[3]), hipUnpack0(prgb0.data[4]), hipUnpack3(prgb0.data[4]), hipUnpack2(prgb0.data[5]))); + green1.x = hipPack(make_float4(hipUnpack1(prgb1.data[0]), hipUnpack0(prgb1.data[1]), hipUnpack3(prgb1.data[1]), hipUnpack2(prgb1.data[2]))); + green1.y = hipPack(make_float4(hipUnpack1(prgb1.data[3]), hipUnpack0(prgb1.data[4]), hipUnpack3(prgb1.data[4]), hipUnpack2(prgb1.data[5]))); + + blue0.x = hipPack(make_float4(hipUnpack2(prgb0.data[0]), hipUnpack1(prgb0.data[1]), hipUnpack0(prgb0.data[2]), hipUnpack3(prgb0.data[2]))); + blue0.y = hipPack(make_float4(hipUnpack2(prgb0.data[3]), hipUnpack1(prgb0.data[4]), hipUnpack0(prgb0.data[5]), hipUnpack3(prgb0.data[5]))); + blue1.x = hipPack(make_float4(hipUnpack2(prgb1.data[0]), hipUnpack1(prgb1.data[1]), hipUnpack0(prgb1.data[2]), hipUnpack3(prgb1.data[2]))); + blue1.y = hipPack(make_float4(hipUnpack2(prgb1.data[3]), hipUnpack1(prgb1.data[4]), hipUnpack0(prgb1.data[5]), hipUnpack3(prgb1.data[5]))); + + *((uint2 *)(&dst_image_r[rgb0_idx])) = red0; + *((uint2 *)(&dst_image_r[rgb1_idx])) = red1; + *((uint2 *)(&dst_image_g[rgb0_idx])) = green0; + *((uint2 *)(&dst_image_g[rgb1_idx])) = green1; + *((uint2 *)(&dst_image_b[rgb0_idx])) = blue0; + *((uint2 *)(&dst_image_b[rgb1_idx])) = blue1; + } +} + +void ColorConvertYUYVToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, + const uint8_t *src_image, uint32_t src_image_stride_in_bytes) { + + int32_t local_threads_x = 16; + int32_t local_threads_y = 4; + int32_t global_threads_x = (dst_width + 7) >> 3; + int32_t global_threads_y = (dst_height + 1) >> 1; + + uint32_t dst_width_comp = (dst_width + 7) / 8; + uint32_t dst_height_comp = (dst_height + 1) / 2; + uint32_t dst_image_stride_in_bytes_comp = dst_image_stride_in_bytes * 2; + uint32_t src_image_stride_in_bytes_comp = src_image_stride_in_bytes * 2; + + ColorConvertYUYVToRGBPlanarKernel<<(global_threads_x) / local_threads_x), ceil(static_cast(global_threads_y) / local_threads_y)), + dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, dst_image_r, dst_image_g, dst_image_b, dst_image_stride_in_bytes, dst_image_stride_in_bytes_comp, src_image, src_image_stride_in_bytes, src_image_stride_in_bytes_comp, dst_width_comp, dst_height_comp); } @@ -568,18 +940,14 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei float3 yuv; DUINT6 rgb0, rgb1; - yuv.x = hipUnpack0(y0.x); - yuv.y = hipUnpack0(u0.x); - yuv.z = hipUnpack0(v0.x); + yuv = make_float3(hipUnpack0(y0.x), hipUnpack0(u0.x), hipUnpack0(v0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(y0.x); - yuv.y = hipUnpack1(u0.x); - yuv.z = hipUnpack1(v0.x); + yuv = make_float3(hipUnpack1(y0.x), hipUnpack1(u0.x), hipUnpack1(v0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -588,9 +956,7 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(y0.x); - yuv.y = hipUnpack2(u0.x); - yuv.z = hipUnpack2(v0.x); + yuv = make_float3(hipUnpack2(y0.x), hipUnpack2(u0.x), hipUnpack2(v0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -599,9 +965,7 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei rgb0.data[1] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(y0.x); - yuv.y = hipUnpack3(u0.x); - yuv.z = hipUnpack3(v0.x); + yuv = make_float3(hipUnpack3(y0.x), hipUnpack3(u0.x), hipUnpack3(v0.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -610,18 +974,14 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.w = fmaf(cb.x, yuv.y, yuv.x); rgb0.data[2] = hipPack(f); - yuv.x = hipUnpack0(y0.y); - yuv.y = hipUnpack0(u0.y); - yuv.z = hipUnpack0(v0.y); + yuv = make_float3(hipUnpack0(y0.y), hipUnpack0(u0.y), hipUnpack0(v0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(y0.y); - yuv.y = hipUnpack1(u0.y); - yuv.z = hipUnpack1(v0.y); + yuv = make_float3(hipUnpack1(y0.y), hipUnpack1(u0.y), hipUnpack1(v0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -630,9 +990,7 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(y0.y); - yuv.y = hipUnpack2(u0.y); - yuv.z = hipUnpack2(v0.y); + yuv = make_float3(hipUnpack2(y0.y), hipUnpack2(u0.y), hipUnpack2(v0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -641,9 +999,7 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei rgb0.data[4] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(y0.y); - yuv.y = hipUnpack3(u0.y); - yuv.z = hipUnpack3(v0.y); + yuv = make_float3(hipUnpack3(y0.y), hipUnpack3(u0.y), hipUnpack3(v0.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -652,18 +1008,14 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.w = fmaf(cb.x, yuv.y, yuv.x); rgb0.data[5] = hipPack(f); - yuv.x = hipUnpack0(y1.x); - yuv.y = hipUnpack0(u1.x); - yuv.z = hipUnpack0(v1.x); + yuv = make_float3(hipUnpack0(y1.x), hipUnpack0(u1.x), hipUnpack0(v1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(y1.x); - yuv.y = hipUnpack1(u1.x); - yuv.z = hipUnpack1(v1.x); + yuv = make_float3(hipUnpack1(y1.x), hipUnpack1(u1.x), hipUnpack1(v1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -672,9 +1024,7 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(y1.x); - yuv.y = hipUnpack2(u1.x); - yuv.z = hipUnpack2(v1.x); + yuv = make_float3(hipUnpack2(y1.x), hipUnpack2(u1.x), hipUnpack2(v1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -683,9 +1033,7 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei rgb1.data[1] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(y1.x); - yuv.y = hipUnpack3(u1.x); - yuv.z = hipUnpack3(v1.x); + yuv = make_float3(hipUnpack3(y1.x), hipUnpack3(u1.x), hipUnpack3(v1.x)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -694,18 +1042,14 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.w = fmaf(cb.x, yuv.y, yuv.x); rgb1.data[2] = hipPack(f); - yuv.x = hipUnpack0(y1.y); - yuv.y = hipUnpack0(u1.y); - yuv.z = hipUnpack0(v1.y); + yuv = make_float3(hipUnpack0(y1.y), hipUnpack0(u1.y), hipUnpack0(v1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.x = fmaf(cr.y, yuv.z, yuv.x); f.y = fmaf(cg.x, yuv.y, yuv.x); f.y = fmaf(cg.y, yuv.z, f.y); f.z = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack1(y1.y); - yuv.y = hipUnpack1(u1.y); - yuv.z = hipUnpack1(v1.y); + yuv = make_float3(hipUnpack1(y1.y), hipUnpack1(u1.y), hipUnpack1(v1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.w = fmaf(cr.y, yuv.z, yuv.x); @@ -714,9 +1058,7 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei f.x = fmaf(cg.x, yuv.y, yuv.x); f.x = fmaf(cg.y, yuv.z, f.x); f.y = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack2(y1.y); - yuv.y = hipUnpack2(u1.y); - yuv.z = hipUnpack2(v1.y); + yuv = make_float3(hipUnpack2(y1.y), hipUnpack2(u1.y), hipUnpack2(v1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.z = fmaf(cr.y, yuv.z, yuv.x); @@ -725,9 +1067,7 @@ __global__ void ColorConvertNV12ToRGBKernel(uint32_t dst_width, uint32_t dst_hei rgb1.data[4] = hipPack(f); f.x = fmaf(cb.x, yuv.y, yuv.x); - yuv.x = hipUnpack3(y1.y); - yuv.y = hipUnpack3(u1.y); - yuv.z = hipUnpack3(v1.y); + yuv = make_float3(hipUnpack3(y1.y), hipUnpack3(u1.y), hipUnpack3(v1.y)); yuv.y -= 128.0f; yuv.z -= 128.0f; f.y = fmaf(cr.y, yuv.z, yuv.x); @@ -761,6 +1101,249 @@ void ColorConvertNV12ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_ src_chroma_image_stride_in_bytes, dst_width_comp, dst_height_comp, src_luma_image_stride_in_bytes_comp); } +__global__ void ColorConvertNV12ToRGBPlanarKernel(uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, uint32_t dst_image_stride_in_bytes_comp, + const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes, + const uint8_t *src_chroma_image, uint32_t src_chroma_image_stride_in_bytes, + uint32_t dst_width_comp, uint32_t dst_height_comp, uint32_t src_luma_image_stride_in_bytes_comp) { + + int32_t x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int32_t y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + if ((x < dst_width_comp) && (y < dst_height_comp)) { + uint32_t src_y0_idx = y * src_luma_image_stride_in_bytes_comp + (x << 3); + uint32_t src_y1_idx = src_y0_idx + src_luma_image_stride_in_bytes; + uint32_t src_uv_idx = y * src_chroma_image_stride_in_bytes + (x << 3); + uint2 y0 = *((uint2 *)(&src_luma_image[src_y0_idx])); + uint2 y1 = *((uint2 *)(&src_luma_image[src_y1_idx])); + uint2 uv = *((uint2 *)(&src_chroma_image[src_uv_idx])); + + uint32_t rgb0_idx = y * dst_image_stride_in_bytes_comp + (x * 8); + uint32_t rgb1_idx = rgb0_idx + dst_image_stride_in_bytes; + + float4 f; + uint2 u0, u1; + uint2 v0, v1; + + f.x = hipUnpack0(uv.x); + f.y = f.x; + f.z = hipUnpack2(uv.x); + f.w = f.z; + u0.x = hipPack(f); + + f.x = hipUnpack0(uv.y); + f.y = f.x; + f.z = hipUnpack2(uv.y); + f.w = f.z; + u0.y = hipPack(f); + + u1.x = u0.x; + u1.y = u0.y; + + f.x = hipUnpack1(uv.x); + f.y = f.x; + f.z = hipUnpack3(uv.x); + f.w = f.z; + v0.x = hipPack(f); + + f.x = hipUnpack1(uv.y); + f.y = f.x; + f.z = hipUnpack3(uv.y); + f.w = f.z; + v0.y = hipPack(f); + + v1.x = v0.x; + v1.y = v0.y; + + float2 cr = make_float2( 0.0000f, 1.5748f); + float2 cg = make_float2(-0.1873f, -0.4681f); + float2 cb = make_float2( 1.8556f, 0.0000f); + float3 yuv; + DUINT6 rgb0, rgb1; + + yuv = make_float3(hipUnpack0(y0.x), hipUnpack0(u0.x), hipUnpack0(v0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(y0.x), hipUnpack1(u0.x), hipUnpack1(v0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + rgb0.data[0] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(y0.x), hipUnpack2(u0.x), hipUnpack2(v0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + rgb0.data[1] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(y0.x), hipUnpack3(u0.x), hipUnpack3(v0.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + rgb0.data[2] = hipPack(f); + + yuv = make_float3(hipUnpack0(y0.y), hipUnpack0(u0.y), hipUnpack0(v0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(y0.y), hipUnpack1(u0.y), hipUnpack1(v0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + rgb0.data[3] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(y0.y), hipUnpack2(u0.y), hipUnpack2(v0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + rgb0.data[4] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(y0.y), hipUnpack3(u0.y), hipUnpack3(v0.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + rgb0.data[5] = hipPack(f); + + yuv = make_float3(hipUnpack0(y1.x), hipUnpack0(u1.x), hipUnpack0(v1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(y1.x), hipUnpack1(u1.x), hipUnpack1(v1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + rgb1.data[0] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(y1.x), hipUnpack2(u1.x), hipUnpack2(v1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + rgb1.data[1] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(y1.x), hipUnpack3(u1.x), hipUnpack3(v1.x)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + rgb1.data[2] = hipPack(f); + + yuv = make_float3(hipUnpack0(y1.y), hipUnpack0(u1.y), hipUnpack0(v1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.x = fmaf(cr.y, yuv.z, yuv.x); + f.y = fmaf(cg.x, yuv.y, yuv.x); + f.y = fmaf(cg.y, yuv.z, f.y); + f.z = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack1(y1.y), hipUnpack1(u1.y), hipUnpack1(v1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.w = fmaf(cr.y, yuv.z, yuv.x); + rgb1.data[3] = hipPack(f); + + f.x = fmaf(cg.x, yuv.y, yuv.x); + f.x = fmaf(cg.y, yuv.z, f.x); + f.y = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack2(y1.y), hipUnpack2(u1.y), hipUnpack2(v1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.z = fmaf(cr.y, yuv.z, yuv.x); + f.w = fmaf(cg.x, yuv.y, yuv.x); + f.w = fmaf(cg.y, yuv.z, f.w); + rgb1.data[4] = hipPack(f); + + f.x = fmaf(cb.x, yuv.y, yuv.x); + yuv = make_float3(hipUnpack3(y1.y), hipUnpack3(u1.y), hipUnpack3(v1.y)); + yuv.y -= 128.0f; + yuv.z -= 128.0f; + f.y = fmaf(cr.y, yuv.z, yuv.x); + f.z = fmaf(cg.x, yuv.y, yuv.x); + f.z = fmaf(cg.y, yuv.z, f.z); + f.w = fmaf(cb.x, yuv.y, yuv.x); + rgb1.data[5] = hipPack(f); + + uint2 red0, red1, green0, green1, blue0, blue1; + red0.x = hipPack(make_float4(hipUnpack0(rgb0.data[0]), hipUnpack3(rgb0.data[0]), hipUnpack2(rgb0.data[1]), hipUnpack1(rgb0.data[2]))); + red0.y = hipPack(make_float4(hipUnpack0(rgb0.data[3]), hipUnpack3(rgb0.data[3]), hipUnpack2(rgb0.data[4]), hipUnpack1(rgb0.data[5]))); + red1.x = hipPack(make_float4(hipUnpack0(rgb1.data[0]), hipUnpack3(rgb1.data[0]), hipUnpack2(rgb1.data[1]), hipUnpack1(rgb1.data[2]))); + red1.y = hipPack(make_float4(hipUnpack0(rgb1.data[3]), hipUnpack3(rgb1.data[3]), hipUnpack2(rgb1.data[4]), hipUnpack1(rgb1.data[5]))); + + green0.x = hipPack(make_float4(hipUnpack1(rgb0.data[0]), hipUnpack0(rgb0.data[1]), hipUnpack3(rgb0.data[1]), hipUnpack2(rgb0.data[2]))); + green0.y = hipPack(make_float4(hipUnpack1(rgb0.data[3]), hipUnpack0(rgb0.data[4]), hipUnpack3(rgb0.data[4]), hipUnpack2(rgb0.data[5]))); + green1.x = hipPack(make_float4(hipUnpack1(rgb1.data[0]), hipUnpack0(rgb1.data[1]), hipUnpack3(rgb1.data[1]), hipUnpack2(rgb1.data[2]))); + green1.y = hipPack(make_float4(hipUnpack1(rgb1.data[3]), hipUnpack0(rgb1.data[4]), hipUnpack3(rgb1.data[4]), hipUnpack2(rgb1.data[5]))); + + blue0.x = hipPack(make_float4(hipUnpack2(rgb0.data[0]), hipUnpack1(rgb0.data[1]), hipUnpack0(rgb0.data[2]), hipUnpack3(rgb0.data[2]))); + blue0.y = hipPack(make_float4(hipUnpack2(rgb0.data[3]), hipUnpack1(rgb0.data[4]), hipUnpack0(rgb0.data[5]), hipUnpack3(rgb0.data[5]))); + blue1.x = hipPack(make_float4(hipUnpack2(rgb1.data[0]), hipUnpack1(rgb1.data[1]), hipUnpack0(rgb1.data[2]), hipUnpack3(rgb1.data[2]))); + blue1.y = hipPack(make_float4(hipUnpack2(rgb1.data[3]), hipUnpack1(rgb1.data[4]), hipUnpack0(rgb1.data[5]), hipUnpack3(rgb1.data[5]))); + + *((uint2 *)(&dst_image_r[rgb0_idx])) = red0; + *((uint2 *)(&dst_image_r[rgb1_idx])) = red1; + *((uint2 *)(&dst_image_g[rgb0_idx])) = green0; + *((uint2 *)(&dst_image_g[rgb1_idx])) = green1; + *((uint2 *)(&dst_image_b[rgb0_idx])) = blue0; + *((uint2 *)(&dst_image_b[rgb1_idx])) = blue1; + } +} + +void ColorConvertNV12ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, + const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes, + const uint8_t *src_chroma_image, uint32_t src_chroma_image_stride_in_bytes) { + + int32_t local_threads_x = 16; + int32_t local_threads_y = 4; + int32_t global_threads_x = (dst_width + 7) >> 3; + int32_t global_threads_y = (dst_height + 1) >> 1; + + uint32_t dst_width_comp = (dst_width + 7) / 8; + uint32_t dst_height_comp = (dst_height + 1) / 2; + uint32_t dst_image_stride_in_bytes_comp = dst_image_stride_in_bytes * 2; + uint32_t src_luma_image_stride_in_bytes_comp = src_luma_image_stride_in_bytes * 2; + + ColorConvertNV12ToRGBPlanarKernel<<(global_threads_x) / local_threads_x), ceil(static_cast(global_threads_y) / local_threads_y)), + dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, dst_image_r, dst_image_g, dst_image_b, dst_image_stride_in_bytes, + dst_image_stride_in_bytes_comp, src_luma_image, src_luma_image_stride_in_bytes, src_chroma_image, + src_chroma_image_stride_in_bytes, dst_width_comp, dst_height_comp, src_luma_image_stride_in_bytes_comp); + +} + __global__ void ColorConvertYUV400ToRGBKernel(uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, uint32_t dst_image_stride_in_bytes_comp, const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes, @@ -842,6 +1425,54 @@ void ColorConvertYUV400ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t ds } +__global__ void ColorConvertYUV400ToRGBPlanarKernel(uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, uint32_t dst_image_stride_in_bytes_comp, + const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes, + uint32_t dst_width_comp, uint32_t dst_height_comp, uint32_t src_luma_image_stride_in_bytes_comp) { + + int32_t x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int32_t y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + if ((x < dst_width_comp) && (y < dst_height_comp)) { + uint32_t src_y0_idx = y * src_luma_image_stride_in_bytes_comp + (x << 3); + uint32_t src_y1_idx = src_y0_idx + src_luma_image_stride_in_bytes; + + uint2 y0 = *((uint2 *)(&src_luma_image[src_y0_idx])); + uint2 y1 = *((uint2 *)(&src_luma_image[src_y1_idx])); + + uint32_t rgb0_idx = y * dst_image_stride_in_bytes_comp + (x * 8); + uint32_t rgb1_idx = rgb0_idx + dst_image_stride_in_bytes; + + *((uint2 *)(&dst_image_r[rgb0_idx])) = y0; + *((uint2 *)(&dst_image_r[rgb1_idx])) = y1; + *((uint2 *)(&dst_image_g[rgb0_idx])) = y0; + *((uint2 *)(&dst_image_g[rgb1_idx])) = y1; + *((uint2 *)(&dst_image_b[rgb0_idx])) = y0; + *((uint2 *)(&dst_image_b[rgb1_idx])) = y1; + } +} + +void ColorConvertYUV400ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, + const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes) { + + int32_t local_threads_x = 16; + int32_t local_threads_y = 4; + int32_t global_threads_x = (dst_width + 7) >> 3; + int32_t global_threads_y = (dst_height + 1) >> 1; + + uint32_t dst_width_comp = (dst_width + 7) / 8; + uint32_t dst_height_comp = (dst_height + 1) / 2; + uint32_t dst_image_stride_in_bytes_comp = dst_image_stride_in_bytes * 2; + uint32_t src_luma_image_stride_in_bytes_comp = src_luma_image_stride_in_bytes * 2; + + ColorConvertYUV400ToRGBPlanarKernel<<(global_threads_x) / local_threads_x), ceil(static_cast(global_threads_y) / local_threads_y)), + dim3(local_threads_x, local_threads_y), 0, stream>>>(dst_width, dst_height, dst_image_r, dst_image_g, dst_image_b, dst_image_stride_in_bytes, + dst_image_stride_in_bytes_comp, src_luma_image, src_luma_image_stride_in_bytes, dst_width_comp, dst_height_comp, + src_luma_image_stride_in_bytes_comp); + +} + __global__ void ColorConvertRGBAToRGBKernel(uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, const uint8_t *src_image, uint32_t src_image_stride_in_bytes) { diff --git a/projects/rocjpeg/src/rocjpeg_hip_kernels.h b/projects/rocjpeg/src/rocjpeg_hip_kernels.h index 8c981ebe81..9b3407be00 100644 --- a/projects/rocjpeg/src/rocjpeg_hip_kernels.h +++ b/projects/rocjpeg/src/rocjpeg_hip_kernels.h @@ -48,6 +48,23 @@ void ColorConvertRGBAToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_ uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, const uint8_t *src_image, uint32_t src_image_stride_in_bytes); +void ColorConvertYUV444ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, const uint8_t *src_yuv_image, + uint32_t src_yuv_image_stride_in_bytes, uint32_t src_u_image_offset); + +void ColorConvertYUYVToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, + const uint8_t *src_image, uint32_t src_image_stride_in_bytes); + +void ColorConvertNV12ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, + const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes, + const uint8_t *src_chroma_image, uint32_t src_chroma_image_stride_in_bytes); + +void ColorConvertYUV400ToRGBPlanar(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image_r, uint8_t *dst_image_g, uint8_t *dst_image_b, uint32_t dst_image_stride_in_bytes, + const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes); + void ConvertInterleavedUVToPlanarUV(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image1, uint8_t *dst_image2, uint32_t dst_image_stride_in_bytes, const uint8_t *src_image1, uint32_t src_image1_stride_in_bytes); diff --git a/projects/rocjpeg/src/rocjpeg_vaapi_decoder.cpp b/projects/rocjpeg/src/rocjpeg_vaapi_decoder.cpp index b81a6a0c01..507f1df0bc 100644 --- a/projects/rocjpeg/src/rocjpeg_vaapi_decoder.cpp +++ b/projects/rocjpeg/src/rocjpeg_vaapi_decoder.cpp @@ -224,9 +224,14 @@ RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg // If RGB output format is requested, and the HW JPEG decoder has a built-in format conversion, // set the RGB surface format and attributes to obtain the RGB output directly from the JPEG HW decoder. // otherwise set the appropriate surface format and attributes based on the chroma subsampling of the image. - if (output_format == ROCJPEG_OUTPUT_RGB && current_vcn_jpeg_spec_.can_convert_to_rgb) { - surface_format = VA_RT_FORMAT_RGB32; - surface_attrib.value.value.i = VA_FOURCC_RGBA; + if ((output_format == ROCJPEG_OUTPUT_RGB || output_format == ROCJPEG_OUTPUT_RGB_PLANAR) && current_vcn_jpeg_spec_.can_convert_to_rgb) { + if (output_format == ROCJPEG_OUTPUT_RGB) { + surface_format = VA_RT_FORMAT_RGB32; + surface_attrib.value.value.i = VA_FOURCC_RGBA; + } else if (output_format == ROCJPEG_OUTPUT_RGB_PLANAR) { + surface_format = VA_RT_FORMAT_RGBP; + surface_attrib.value.value.i = VA_FOURCC_RGBP; + } } else { switch (jpeg_stream_params->chroma_subsampling) { case CSS_444: diff --git a/projects/rocjpeg/test/CMakeLists.txt b/projects/rocjpeg/test/CMakeLists.txt index b323826c78..685d214646 100644 --- a/projects/rocjpeg/test/CMakeLists.txt +++ b/projects/rocjpeg/test/CMakeLists.txt @@ -52,7 +52,7 @@ endif() add_test( NAME - jpeg-decode-fmt-unchanged + jpeg-decode-fmt-native COMMAND "${CMAKE_CTEST_COMMAND}" --build-and-test "${ROCM_PATH}/share/rocjpeg/samples/jpegDecode" @@ -64,7 +64,7 @@ add_test( add_test( NAME - jpeg-decode-fmt-yuv + jpeg-decode-fmt-yuv-planar COMMAND "${CMAKE_CTEST_COMMAND}" --build-and-test "${ROCM_PATH}/share/rocjpeg/samples/jpegDecode" @@ -88,12 +88,24 @@ add_test( add_test( NAME - jpeg-decode-fmt-rgbi + jpeg-decode-fmt-rgb COMMAND "${CMAKE_CTEST_COMMAND}" --build-and-test "${ROCM_PATH}/share/rocjpeg/samples/jpegDecode" "${CMAKE_CURRENT_BINARY_DIR}/jpegDecode" --build-generator "${CMAKE_GENERATOR}" --test-command "jpegdecode" - -i ${ROCM_PATH}/share/rocjpeg/images/ -fmt rgbi + -i ${ROCM_PATH}/share/rocjpeg/images/ -fmt rgb +) + +add_test( + NAME + jpeg-decode-fmt-rgb-planar + COMMAND + "${CMAKE_CTEST_COMMAND}" + --build-and-test "${ROCM_PATH}/share/rocjpeg/samples/jpegDecode" + "${CMAKE_CURRENT_BINARY_DIR}/jpegDecode" + --build-generator "${CMAKE_GENERATOR}" + --test-command "jpegdecode" + -i ${ROCM_PATH}/share/rocjpeg/images/ -fmt rgb_planar ) \ No newline at end of file