Add support for HW-based RGB format conversion on MI300 (#13)
* Add support for HW-based RGB format conversion on MI300 * code clean up * change variable names
Šī revīzija ir iekļauta:
revīziju iesūtīja
GitHub
vecāks
2f23ebe47b
revīzija
63dc882415
@@ -78,7 +78,7 @@ RocJpegStatus ROCJpegDecoder::Decode(const uint8_t *data, size_t length, RocJpeg
|
||||
|
||||
const JpegStreamParameters *jpeg_stream_params = jpeg_parser_.GetJpegStreamParameters();
|
||||
VASurfaceID current_surface_id;
|
||||
CHECK_ROCJPEG(jpeg_vaapi_decoder_.SubmitDecode(jpeg_stream_params, current_surface_id));
|
||||
CHECK_ROCJPEG(jpeg_vaapi_decoder_.SubmitDecode(jpeg_stream_params, current_surface_id, output_format));
|
||||
|
||||
if (destination != nullptr) {
|
||||
VADRMPRIMESurfaceDescriptor va_drm_prime_surface_desc = {};
|
||||
@@ -87,15 +87,16 @@ RocJpegStatus ROCJpegDecoder::Decode(const uint8_t *data, size_t length, RocJpeg
|
||||
CHECK_ROCJPEG(GetHipInteropMem(va_drm_prime_surface_desc));
|
||||
|
||||
uint16_t chroma_height = 0;
|
||||
CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height));
|
||||
|
||||
switch (output_format) {
|
||||
case ROCJPEG_OUTPUT_NATIVE:
|
||||
// copy the native decoded output buffers from interop memory directly to the destination buffers
|
||||
CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height));
|
||||
CHECK_ROCJPEG(CopyLuma(destination, jpeg_stream_params->picture_parameter_buffer.picture_height));
|
||||
CHECK_ROCJPEG(CopyChroma(destination, chroma_height));
|
||||
break;
|
||||
case ROCJPEG_OUTPUT_YUV_PLANAR:
|
||||
CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height));
|
||||
CHECK_ROCJPEG(GetPlanarYUVOutputFormat(jpeg_stream_params->picture_parameter_buffer.picture_width,
|
||||
jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height, destination));
|
||||
break;
|
||||
@@ -295,6 +296,10 @@ RocJpegStatus ROCJpegDecoder::ColorConvertToRGB(uint32_t picture_width, uint32_t
|
||||
ColorConvertYUV400ToRGB(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0],
|
||||
hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]);
|
||||
break;
|
||||
case VA_FOURCC_RGBA:
|
||||
ColorConvertRGBAToRGB(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0],
|
||||
hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]);
|
||||
break;
|
||||
default:
|
||||
ERR("ERROR! surface format is not supported!");
|
||||
return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED;
|
||||
|
||||
@@ -842,6 +842,43 @@ void ColorConvertYUV400ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t ds
|
||||
|
||||
}
|
||||
|
||||
__global__ void ColorConvertRGBAToRGBKernel(uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image, uint32_t dst_image_stride_in_bytes,
|
||||
const uint8_t *src_image, uint32_t src_image_stride_in_bytes) {
|
||||
|
||||
int x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 8;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
|
||||
if (x >= dst_width || y >= dst_height) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t src_idx = y * src_image_stride_in_bytes + (x << 2);
|
||||
uint32_t dst_idx = y * dst_image_stride_in_bytes + (x * 3);
|
||||
|
||||
DUINT8 src = *((DUINT8 *)(&src_image[src_idx]));
|
||||
DUINT6 dst;
|
||||
|
||||
dst.data[0] = hipPack(make_float4(hipUnpack0(src.data[0]), hipUnpack1(src.data[0]), hipUnpack2(src.data[0]), hipUnpack0(src.data[1])));
|
||||
dst.data[1] = hipPack(make_float4(hipUnpack1(src.data[1]), hipUnpack2(src.data[1]), hipUnpack0(src.data[2]), hipUnpack1(src.data[2])));
|
||||
dst.data[2] = hipPack(make_float4(hipUnpack2(src.data[2]), hipUnpack0(src.data[3]), hipUnpack1(src.data[3]), hipUnpack2(src.data[3])));
|
||||
dst.data[3] = hipPack(make_float4(hipUnpack0(src.data[4]), hipUnpack1(src.data[4]), hipUnpack2(src.data[4]), hipUnpack0(src.data[5])));
|
||||
dst.data[4] = hipPack(make_float4(hipUnpack1(src.data[5]), hipUnpack2(src.data[5]), hipUnpack0(src.data[6]), hipUnpack1(src.data[6])));
|
||||
dst.data[5] = hipPack(make_float4(hipUnpack2(src.data[6]), hipUnpack0(src.data[7]), hipUnpack1(src.data[7]), hipUnpack2(src.data[7])));
|
||||
|
||||
*((DUINT6 *)(&dst_image[dst_idx])) = dst;
|
||||
}
|
||||
|
||||
void ColorConvertRGBAToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image, uint32_t dst_image_stride_in_bytes,
|
||||
const uint8_t *src_image, uint32_t src_image_stride_in_bytes) {
|
||||
int localThreads_x = 16;
|
||||
int localThreads_y = 16;
|
||||
int globalThreads_x = (dst_width + 7) >> 3;
|
||||
int globalThreads_y = dst_height;
|
||||
|
||||
ColorConvertRGBAToRGBKernel<<<dim3(ceil(static_cast<float>(globalThreads_x) / localThreads_x), ceil(static_cast<float>(globalThreads_y) / localThreads_y)),
|
||||
dim3(localThreads_x, localThreads_y), 0, stream >>>(dst_width, dst_height, dst_image, dst_image_stride_in_bytes,
|
||||
src_image, src_image_stride_in_bytes);
|
||||
}
|
||||
|
||||
__global__ void ConvertInterleavedUVToPlanarUVKernel(uint32_t dst_width, uint32_t dst_height,
|
||||
uint8_t *dst_image1, uint8_t *dst_image2, uint32_t dst_image_stride_in_bytes,
|
||||
|
||||
@@ -44,6 +44,10 @@ void ColorConvertYUV400ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t ds
|
||||
uint8_t *dst_image, uint32_t dst_image_stride_in_bytes,
|
||||
const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes);
|
||||
|
||||
void ColorConvertRGBAToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_height,
|
||||
uint8_t *dst_image, uint32_t dst_image_stride_in_bytes,
|
||||
const uint8_t *src_image, uint32_t src_image_stride_in_bytes);
|
||||
|
||||
void ConvertInterleavedUVToPlanarUV(hipStream_t stream, uint32_t dst_width, uint32_t dst_height,
|
||||
uint8_t *dst_image1, uint8_t *dst_image2, uint32_t dst_image_stride_in_bytes,
|
||||
const uint8_t *src_image1, uint32_t src_image1_stride_in_bytes);
|
||||
@@ -59,4 +63,8 @@ typedef struct UINT6TYPE {
|
||||
uint data[6];
|
||||
} DUINT6;
|
||||
|
||||
typedef struct UINT8TYPE {
|
||||
uint data[8];
|
||||
} DUINT8;
|
||||
|
||||
#endif //ROC_JPEG_HIP_KERNELS_H_
|
||||
@@ -25,7 +25,19 @@ THE SOFTWARE.
|
||||
RocJpegVappiDecoder::RocJpegVappiDecoder(int device_id) : device_id_{device_id}, drm_fd_{-1}, min_picture_width_{64}, min_picture_height_{64},
|
||||
max_picture_width_{4096}, max_picture_height_{4096}, va_display_{0}, va_config_attrib_{{}}, va_config_id_{0}, va_profile_{VAProfileJPEGBaseline},
|
||||
va_context_id_{0}, va_surface_ids_{}, va_picture_parameter_buf_id_{0}, va_quantization_matrix_buf_id_{0}, va_huffmantable_buf_id_{0},
|
||||
va_slice_param_buf_id_{0}, va_slice_data_buf_id_{0} {};
|
||||
va_slice_param_buf_id_{0}, va_slice_data_buf_id_{0}, current_vcn_jpeg_spec_{0} {
|
||||
vcn_jpeg_spec_ = {{"gfx908", {2, false, false}},
|
||||
{"gfx90a", {2, false, false}},
|
||||
{"gfx940", {24, true, true}},
|
||||
{"gfx941", {32, true, true}},
|
||||
{"gfx942", {32, true, true}},
|
||||
{"gfx1030", {2, false, false}},
|
||||
{"gfx1031", {2, false, false}},
|
||||
{"gfx1032", {2, false, false}},
|
||||
{"gfx1100", {2, false, false}},
|
||||
{"gfx1101", {1, false, false}},
|
||||
{"gfx1102", {2, false, false}}};
|
||||
};
|
||||
|
||||
RocJpegVappiDecoder::~RocJpegVappiDecoder() {
|
||||
if (drm_fd_ != -1) {
|
||||
@@ -68,6 +80,10 @@ RocJpegStatus RocJpegVappiDecoder::InitializeDecoder(std::string device_name, st
|
||||
std::size_t pos = gcn_arch_name.find_first_of(":");
|
||||
std::string gcn_arch_name_base = (pos != std::string::npos) ? gcn_arch_name.substr(0, pos) : gcn_arch_name;
|
||||
|
||||
auto it = vcn_jpeg_spec_.find(gcn_arch_name_base);
|
||||
if (it != vcn_jpeg_spec_.end()) {
|
||||
current_vcn_jpeg_spec_ = it->second;
|
||||
}
|
||||
std::vector<int> visible_devices;
|
||||
GetVisibleDevices(visible_devices);
|
||||
|
||||
@@ -179,7 +195,7 @@ RocJpegStatus RocJpegVappiDecoder::DestroyDataBuffers() {
|
||||
return ROCJPEG_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id) {
|
||||
RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id, RocJpegOutputFormat output_format) {
|
||||
if (jpeg_stream_params == nullptr) {
|
||||
return ROCJPEG_STATUS_INVALID_PARAMETER;
|
||||
}
|
||||
@@ -199,28 +215,45 @@ RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg
|
||||
return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
uint8_t surface_format;
|
||||
switch (jpeg_stream_params->chroma_subsampling) {
|
||||
case CSS_444:
|
||||
surface_format = VA_RT_FORMAT_YUV444;
|
||||
break;
|
||||
case CSS_422:
|
||||
surface_format = VA_RT_FORMAT_YUV422;
|
||||
break;
|
||||
case CSS_420:
|
||||
surface_format = VA_RT_FORMAT_YUV420;
|
||||
break;
|
||||
case CSS_400:
|
||||
surface_format = VA_RT_FORMAT_YUV400;
|
||||
break;
|
||||
default:
|
||||
ERR("ERROR: The chroma subsampling is not supported by the VCN hardware!");
|
||||
return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED;
|
||||
break;
|
||||
uint32_t surface_format;
|
||||
VASurfaceAttrib surface_attrib;
|
||||
surface_attrib.type = VASurfaceAttribPixelFormat;
|
||||
surface_attrib.flags = VA_SURFACE_ATTRIB_SETTABLE;
|
||||
surface_attrib.value.type = VAGenericValueTypeInteger;
|
||||
|
||||
// If RGB output format is requested, and the HW JPEG decoder has a built-in format conversion,
|
||||
// set the RGB surface format and attributes to obtain the RGB output directly from the JPEG HW decoder.
|
||||
// otherwise set the appropriate surface format and attributes based on the chroma subsampling of the image.
|
||||
if (output_format == ROCJPEG_OUTPUT_RGB && current_vcn_jpeg_spec_.can_convert_to_rgb) {
|
||||
surface_format = VA_RT_FORMAT_RGB32;
|
||||
surface_attrib.value.value.i = VA_FOURCC_RGBA;
|
||||
} else {
|
||||
switch (jpeg_stream_params->chroma_subsampling) {
|
||||
case CSS_444:
|
||||
surface_format = VA_RT_FORMAT_YUV444;
|
||||
surface_attrib.value.value.i = VA_FOURCC_444P;
|
||||
break;
|
||||
case CSS_422:
|
||||
surface_format = VA_RT_FORMAT_YUV422;
|
||||
surface_attrib.value.value.i = ROCJPEG_FOURCC_YUYV;
|
||||
break;
|
||||
case CSS_420:
|
||||
surface_format = VA_RT_FORMAT_YUV420;
|
||||
surface_attrib.value.value.i = VA_FOURCC_NV12;
|
||||
break;
|
||||
case CSS_400:
|
||||
surface_format = VA_RT_FORMAT_YUV400;
|
||||
surface_attrib.value.value.i = VA_FOURCC_Y800;
|
||||
break;
|
||||
default:
|
||||
ERR("ERROR: The chroma subsampling is not supported by the VCN hardware!");
|
||||
return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
VASurfaceID va_surface_id;
|
||||
CHECK_VAAPI(vaCreateSurfaces(va_display_, surface_format, jpeg_stream_params->picture_parameter_buffer.picture_width, jpeg_stream_params->picture_parameter_buffer.picture_height, &va_surface_id, 1, nullptr, 1));
|
||||
CHECK_VAAPI(vaCreateSurfaces(va_display_, surface_format, jpeg_stream_params->picture_parameter_buffer.picture_width, jpeg_stream_params->picture_parameter_buffer.picture_height, &va_surface_id, 1, &surface_attrib, 1));
|
||||
va_surface_ids_.push_back(va_surface_id);
|
||||
surface_id = va_surface_id;
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ THE SOFTWARE.
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <filesystem>
|
||||
#include <unordered_map>
|
||||
#include <va/va.h>
|
||||
#include <va/va_drm.h>
|
||||
#include <va/va_drmcommon.h>
|
||||
@@ -50,12 +51,18 @@ typedef enum {
|
||||
kCpx = 4, // Core Partition Accelerator
|
||||
} ComputePartition;
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_jpeg_cores;
|
||||
bool can_convert_to_rgb;
|
||||
bool can_roi_decode;
|
||||
} VcnJpegSpec;
|
||||
|
||||
class RocJpegVappiDecoder {
|
||||
public:
|
||||
RocJpegVappiDecoder(int device_id = 0);
|
||||
~RocJpegVappiDecoder();
|
||||
RocJpegStatus InitializeDecoder(std::string device_name, std::string gcn_arch_name, int device_id);
|
||||
RocJpegStatus SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id);
|
||||
RocJpegStatus SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id, RocJpegOutputFormat output_format);
|
||||
RocJpegStatus ExportSurface(VASurfaceID surface_id, VADRMPRIMESurfaceDescriptor &va_drm_prime_surface_desc);
|
||||
RocJpegStatus SyncSurface(VASurfaceID surface_id);
|
||||
RocJpegStatus ReleaseSurface(VASurfaceID surface_id);
|
||||
@@ -72,6 +79,8 @@ private:
|
||||
VAProfile va_profile_;
|
||||
VAContextID va_context_id_;
|
||||
std::vector<VASurfaceID> va_surface_ids_;
|
||||
std::unordered_map<std::string, VcnJpegSpec> vcn_jpeg_spec_;
|
||||
VcnJpegSpec current_vcn_jpeg_spec_;
|
||||
VABufferID va_picture_parameter_buf_id_;
|
||||
VABufferID va_quantization_matrix_buf_id_;
|
||||
VABufferID va_huffmantable_buf_id_;
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user