diff --git a/src/rocjpeg_decoder.cpp b/src/rocjpeg_decoder.cpp index e707724a60..4f1fc41e75 100644 --- a/src/rocjpeg_decoder.cpp +++ b/src/rocjpeg_decoder.cpp @@ -78,7 +78,7 @@ RocJpegStatus ROCJpegDecoder::Decode(const uint8_t *data, size_t length, RocJpeg const JpegStreamParameters *jpeg_stream_params = jpeg_parser_.GetJpegStreamParameters(); VASurfaceID current_surface_id; - CHECK_ROCJPEG(jpeg_vaapi_decoder_.SubmitDecode(jpeg_stream_params, current_surface_id)); + CHECK_ROCJPEG(jpeg_vaapi_decoder_.SubmitDecode(jpeg_stream_params, current_surface_id, output_format)); if (destination != nullptr) { VADRMPRIMESurfaceDescriptor va_drm_prime_surface_desc = {}; @@ -87,15 +87,16 @@ RocJpegStatus ROCJpegDecoder::Decode(const uint8_t *data, size_t length, RocJpeg CHECK_ROCJPEG(GetHipInteropMem(va_drm_prime_surface_desc)); uint16_t chroma_height = 0; - CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height)); switch (output_format) { case ROCJPEG_OUTPUT_NATIVE: // copy the native decoded output buffers from interop memory directly to the destination buffers + CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height)); CHECK_ROCJPEG(CopyLuma(destination, jpeg_stream_params->picture_parameter_buffer.picture_height)); CHECK_ROCJPEG(CopyChroma(destination, chroma_height)); break; case ROCJPEG_OUTPUT_YUV_PLANAR: + CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height)); CHECK_ROCJPEG(GetPlanarYUVOutputFormat(jpeg_stream_params->picture_parameter_buffer.picture_width, jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height, destination)); break; @@ -295,6 +296,10 @@ RocJpegStatus ROCJpegDecoder::ColorConvertToRGB(uint32_t picture_width, uint32_t ColorConvertYUV400ToRGB(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0], hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]); break; + case VA_FOURCC_RGBA: + ColorConvertRGBAToRGB(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0], + hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]); + break; default: ERR("ERROR! surface format is not supported!"); return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED; diff --git a/src/rocjpeg_hip_kernels.cpp b/src/rocjpeg_hip_kernels.cpp index 9d75648ad0..2d9c1057fc 100644 --- a/src/rocjpeg_hip_kernels.cpp +++ b/src/rocjpeg_hip_kernels.cpp @@ -842,6 +842,43 @@ void ColorConvertYUV400ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t ds } +__global__ void ColorConvertRGBAToRGBKernel(uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, + const uint8_t *src_image, uint32_t src_image_stride_in_bytes) { + + int x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 8; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + if (x >= dst_width || y >= dst_height) { + return; + } + + uint32_t src_idx = y * src_image_stride_in_bytes + (x << 2); + uint32_t dst_idx = y * dst_image_stride_in_bytes + (x * 3); + + DUINT8 src = *((DUINT8 *)(&src_image[src_idx])); + DUINT6 dst; + + dst.data[0] = hipPack(make_float4(hipUnpack0(src.data[0]), hipUnpack1(src.data[0]), hipUnpack2(src.data[0]), hipUnpack0(src.data[1]))); + dst.data[1] = hipPack(make_float4(hipUnpack1(src.data[1]), hipUnpack2(src.data[1]), hipUnpack0(src.data[2]), hipUnpack1(src.data[2]))); + dst.data[2] = hipPack(make_float4(hipUnpack2(src.data[2]), hipUnpack0(src.data[3]), hipUnpack1(src.data[3]), hipUnpack2(src.data[3]))); + dst.data[3] = hipPack(make_float4(hipUnpack0(src.data[4]), hipUnpack1(src.data[4]), hipUnpack2(src.data[4]), hipUnpack0(src.data[5]))); + dst.data[4] = hipPack(make_float4(hipUnpack1(src.data[5]), hipUnpack2(src.data[5]), hipUnpack0(src.data[6]), hipUnpack1(src.data[6]))); + dst.data[5] = hipPack(make_float4(hipUnpack2(src.data[6]), hipUnpack0(src.data[7]), hipUnpack1(src.data[7]), hipUnpack2(src.data[7]))); + + *((DUINT6 *)(&dst_image[dst_idx])) = dst; +} + +void ColorConvertRGBAToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, + const uint8_t *src_image, uint32_t src_image_stride_in_bytes) { + int localThreads_x = 16; + int localThreads_y = 16; + int globalThreads_x = (dst_width + 7) >> 3; + int globalThreads_y = dst_height; + + ColorConvertRGBAToRGBKernel<<(globalThreads_x) / localThreads_x), ceil(static_cast(globalThreads_y) / localThreads_y)), + dim3(localThreads_x, localThreads_y), 0, stream >>>(dst_width, dst_height, dst_image, dst_image_stride_in_bytes, + src_image, src_image_stride_in_bytes); +} __global__ void ConvertInterleavedUVToPlanarUVKernel(uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image1, uint8_t *dst_image2, uint32_t dst_image_stride_in_bytes, diff --git a/src/rocjpeg_hip_kernels.h b/src/rocjpeg_hip_kernels.h index a279d318f4..8c981ebe81 100644 --- a/src/rocjpeg_hip_kernels.h +++ b/src/rocjpeg_hip_kernels.h @@ -44,6 +44,10 @@ void ColorConvertYUV400ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t ds uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes); +void ColorConvertRGBAToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, + uint8_t *dst_image, uint32_t dst_image_stride_in_bytes, + const uint8_t *src_image, uint32_t src_image_stride_in_bytes); + void ConvertInterleavedUVToPlanarUV(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image1, uint8_t *dst_image2, uint32_t dst_image_stride_in_bytes, const uint8_t *src_image1, uint32_t src_image1_stride_in_bytes); @@ -59,4 +63,8 @@ typedef struct UINT6TYPE { uint data[6]; } DUINT6; +typedef struct UINT8TYPE { + uint data[8]; +} DUINT8; + #endif //ROC_JPEG_HIP_KERNELS_H_ \ No newline at end of file diff --git a/src/rocjpeg_vaapi_decoder.cpp b/src/rocjpeg_vaapi_decoder.cpp index ac62da7688..b81a6a0c01 100644 --- a/src/rocjpeg_vaapi_decoder.cpp +++ b/src/rocjpeg_vaapi_decoder.cpp @@ -25,7 +25,19 @@ THE SOFTWARE. RocJpegVappiDecoder::RocJpegVappiDecoder(int device_id) : device_id_{device_id}, drm_fd_{-1}, min_picture_width_{64}, min_picture_height_{64}, max_picture_width_{4096}, max_picture_height_{4096}, va_display_{0}, va_config_attrib_{{}}, va_config_id_{0}, va_profile_{VAProfileJPEGBaseline}, va_context_id_{0}, va_surface_ids_{}, va_picture_parameter_buf_id_{0}, va_quantization_matrix_buf_id_{0}, va_huffmantable_buf_id_{0}, - va_slice_param_buf_id_{0}, va_slice_data_buf_id_{0} {}; + va_slice_param_buf_id_{0}, va_slice_data_buf_id_{0}, current_vcn_jpeg_spec_{0} { + vcn_jpeg_spec_ = {{"gfx908", {2, false, false}}, + {"gfx90a", {2, false, false}}, + {"gfx940", {24, true, true}}, + {"gfx941", {32, true, true}}, + {"gfx942", {32, true, true}}, + {"gfx1030", {2, false, false}}, + {"gfx1031", {2, false, false}}, + {"gfx1032", {2, false, false}}, + {"gfx1100", {2, false, false}}, + {"gfx1101", {1, false, false}}, + {"gfx1102", {2, false, false}}}; + }; RocJpegVappiDecoder::~RocJpegVappiDecoder() { if (drm_fd_ != -1) { @@ -68,6 +80,10 @@ RocJpegStatus RocJpegVappiDecoder::InitializeDecoder(std::string device_name, st std::size_t pos = gcn_arch_name.find_first_of(":"); std::string gcn_arch_name_base = (pos != std::string::npos) ? gcn_arch_name.substr(0, pos) : gcn_arch_name; + auto it = vcn_jpeg_spec_.find(gcn_arch_name_base); + if (it != vcn_jpeg_spec_.end()) { + current_vcn_jpeg_spec_ = it->second; + } std::vector visible_devices; GetVisibleDevices(visible_devices); @@ -179,7 +195,7 @@ RocJpegStatus RocJpegVappiDecoder::DestroyDataBuffers() { return ROCJPEG_STATUS_SUCCESS; } -RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id) { +RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id, RocJpegOutputFormat output_format) { if (jpeg_stream_params == nullptr) { return ROCJPEG_STATUS_INVALID_PARAMETER; } @@ -199,28 +215,45 @@ RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED; } - uint8_t surface_format; - switch (jpeg_stream_params->chroma_subsampling) { - case CSS_444: - surface_format = VA_RT_FORMAT_YUV444; - break; - case CSS_422: - surface_format = VA_RT_FORMAT_YUV422; - break; - case CSS_420: - surface_format = VA_RT_FORMAT_YUV420; - break; - case CSS_400: - surface_format = VA_RT_FORMAT_YUV400; - break; - default: - ERR("ERROR: The chroma subsampling is not supported by the VCN hardware!"); - return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED; - break; + uint32_t surface_format; + VASurfaceAttrib surface_attrib; + surface_attrib.type = VASurfaceAttribPixelFormat; + surface_attrib.flags = VA_SURFACE_ATTRIB_SETTABLE; + surface_attrib.value.type = VAGenericValueTypeInteger; + + // If RGB output format is requested, and the HW JPEG decoder has a built-in format conversion, + // set the RGB surface format and attributes to obtain the RGB output directly from the JPEG HW decoder. + // otherwise set the appropriate surface format and attributes based on the chroma subsampling of the image. + if (output_format == ROCJPEG_OUTPUT_RGB && current_vcn_jpeg_spec_.can_convert_to_rgb) { + surface_format = VA_RT_FORMAT_RGB32; + surface_attrib.value.value.i = VA_FOURCC_RGBA; + } else { + switch (jpeg_stream_params->chroma_subsampling) { + case CSS_444: + surface_format = VA_RT_FORMAT_YUV444; + surface_attrib.value.value.i = VA_FOURCC_444P; + break; + case CSS_422: + surface_format = VA_RT_FORMAT_YUV422; + surface_attrib.value.value.i = ROCJPEG_FOURCC_YUYV; + break; + case CSS_420: + surface_format = VA_RT_FORMAT_YUV420; + surface_attrib.value.value.i = VA_FOURCC_NV12; + break; + case CSS_400: + surface_format = VA_RT_FORMAT_YUV400; + surface_attrib.value.value.i = VA_FOURCC_Y800; + break; + default: + ERR("ERROR: The chroma subsampling is not supported by the VCN hardware!"); + return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED; + break; + } } VASurfaceID va_surface_id; - CHECK_VAAPI(vaCreateSurfaces(va_display_, surface_format, jpeg_stream_params->picture_parameter_buffer.picture_width, jpeg_stream_params->picture_parameter_buffer.picture_height, &va_surface_id, 1, nullptr, 1)); + CHECK_VAAPI(vaCreateSurfaces(va_display_, surface_format, jpeg_stream_params->picture_parameter_buffer.picture_width, jpeg_stream_params->picture_parameter_buffer.picture_height, &va_surface_id, 1, &surface_attrib, 1)); va_surface_ids_.push_back(va_surface_id); surface_id = va_surface_id; diff --git a/src/rocjpeg_vaapi_decoder.h b/src/rocjpeg_vaapi_decoder.h index 7896a616cd..101c8061f4 100644 --- a/src/rocjpeg_vaapi_decoder.h +++ b/src/rocjpeg_vaapi_decoder.h @@ -32,6 +32,7 @@ THE SOFTWARE. #include #include #include +#include #include #include #include @@ -50,12 +51,18 @@ typedef enum { kCpx = 4, // Core Partition Accelerator } ComputePartition; +typedef struct { + uint32_t num_jpeg_cores; + bool can_convert_to_rgb; + bool can_roi_decode; +} VcnJpegSpec; + class RocJpegVappiDecoder { public: RocJpegVappiDecoder(int device_id = 0); ~RocJpegVappiDecoder(); RocJpegStatus InitializeDecoder(std::string device_name, std::string gcn_arch_name, int device_id); - RocJpegStatus SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id); + RocJpegStatus SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id, RocJpegOutputFormat output_format); RocJpegStatus ExportSurface(VASurfaceID surface_id, VADRMPRIMESurfaceDescriptor &va_drm_prime_surface_desc); RocJpegStatus SyncSurface(VASurfaceID surface_id); RocJpegStatus ReleaseSurface(VASurfaceID surface_id); @@ -72,6 +79,8 @@ private: VAProfile va_profile_; VAContextID va_context_id_; std::vector va_surface_ids_; + std::unordered_map vcn_jpeg_spec_; + VcnJpegSpec current_vcn_jpeg_spec_; VABufferID va_picture_parameter_buf_id_; VABufferID va_quantization_matrix_buf_id_; VABufferID va_huffmantable_buf_id_;