Add support for HW-based RGB format conversion on MI300 (#13)

* Add support for HW-based RGB format conversion on MI300

* code clean up

* change variable names
Šī revīzija ir iekļauta:
Aryan Salmanpour
2024-04-11 13:37:08 -04:00
revīziju iesūtīja GitHub
vecāks 2f23ebe47b
revīzija 63dc882415
5 mainīti faili ar 116 papildinājumiem un 24 dzēšanām
+7 -2
Parādīt failu
@@ -78,7 +78,7 @@ RocJpegStatus ROCJpegDecoder::Decode(const uint8_t *data, size_t length, RocJpeg
const JpegStreamParameters *jpeg_stream_params = jpeg_parser_.GetJpegStreamParameters();
VASurfaceID current_surface_id;
CHECK_ROCJPEG(jpeg_vaapi_decoder_.SubmitDecode(jpeg_stream_params, current_surface_id));
CHECK_ROCJPEG(jpeg_vaapi_decoder_.SubmitDecode(jpeg_stream_params, current_surface_id, output_format));
if (destination != nullptr) {
VADRMPRIMESurfaceDescriptor va_drm_prime_surface_desc = {};
@@ -87,15 +87,16 @@ RocJpegStatus ROCJpegDecoder::Decode(const uint8_t *data, size_t length, RocJpeg
CHECK_ROCJPEG(GetHipInteropMem(va_drm_prime_surface_desc));
uint16_t chroma_height = 0;
CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height));
switch (output_format) {
case ROCJPEG_OUTPUT_NATIVE:
// copy the native decoded output buffers from interop memory directly to the destination buffers
CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height));
CHECK_ROCJPEG(CopyLuma(destination, jpeg_stream_params->picture_parameter_buffer.picture_height));
CHECK_ROCJPEG(CopyChroma(destination, chroma_height));
break;
case ROCJPEG_OUTPUT_YUV_PLANAR:
CHECK_ROCJPEG(GetChromaHeight(jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height));
CHECK_ROCJPEG(GetPlanarYUVOutputFormat(jpeg_stream_params->picture_parameter_buffer.picture_width,
jpeg_stream_params->picture_parameter_buffer.picture_height, chroma_height, destination));
break;
@@ -295,6 +296,10 @@ RocJpegStatus ROCJpegDecoder::ColorConvertToRGB(uint32_t picture_width, uint32_t
ColorConvertYUV400ToRGB(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0],
hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]);
break;
case VA_FOURCC_RGBA:
ColorConvertRGBAToRGB(hip_stream_, picture_width, picture_height, destination->channel[0], destination->pitch[0],
hip_interop_.hip_mapped_device_mem, hip_interop_.pitch[0]);
break;
default:
ERR("ERROR! surface format is not supported!");
return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED;
+37
Parādīt failu
@@ -842,6 +842,43 @@ void ColorConvertYUV400ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t ds
}
__global__ void ColorConvertRGBAToRGBKernel(uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image, uint32_t dst_image_stride_in_bytes,
const uint8_t *src_image, uint32_t src_image_stride_in_bytes) {
int x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 8;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
if (x >= dst_width || y >= dst_height) {
return;
}
uint32_t src_idx = y * src_image_stride_in_bytes + (x << 2);
uint32_t dst_idx = y * dst_image_stride_in_bytes + (x * 3);
DUINT8 src = *((DUINT8 *)(&src_image[src_idx]));
DUINT6 dst;
dst.data[0] = hipPack(make_float4(hipUnpack0(src.data[0]), hipUnpack1(src.data[0]), hipUnpack2(src.data[0]), hipUnpack0(src.data[1])));
dst.data[1] = hipPack(make_float4(hipUnpack1(src.data[1]), hipUnpack2(src.data[1]), hipUnpack0(src.data[2]), hipUnpack1(src.data[2])));
dst.data[2] = hipPack(make_float4(hipUnpack2(src.data[2]), hipUnpack0(src.data[3]), hipUnpack1(src.data[3]), hipUnpack2(src.data[3])));
dst.data[3] = hipPack(make_float4(hipUnpack0(src.data[4]), hipUnpack1(src.data[4]), hipUnpack2(src.data[4]), hipUnpack0(src.data[5])));
dst.data[4] = hipPack(make_float4(hipUnpack1(src.data[5]), hipUnpack2(src.data[5]), hipUnpack0(src.data[6]), hipUnpack1(src.data[6])));
dst.data[5] = hipPack(make_float4(hipUnpack2(src.data[6]), hipUnpack0(src.data[7]), hipUnpack1(src.data[7]), hipUnpack2(src.data[7])));
*((DUINT6 *)(&dst_image[dst_idx])) = dst;
}
void ColorConvertRGBAToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_height, uint8_t *dst_image, uint32_t dst_image_stride_in_bytes,
const uint8_t *src_image, uint32_t src_image_stride_in_bytes) {
int localThreads_x = 16;
int localThreads_y = 16;
int globalThreads_x = (dst_width + 7) >> 3;
int globalThreads_y = dst_height;
ColorConvertRGBAToRGBKernel<<<dim3(ceil(static_cast<float>(globalThreads_x) / localThreads_x), ceil(static_cast<float>(globalThreads_y) / localThreads_y)),
dim3(localThreads_x, localThreads_y), 0, stream >>>(dst_width, dst_height, dst_image, dst_image_stride_in_bytes,
src_image, src_image_stride_in_bytes);
}
__global__ void ConvertInterleavedUVToPlanarUVKernel(uint32_t dst_width, uint32_t dst_height,
uint8_t *dst_image1, uint8_t *dst_image2, uint32_t dst_image_stride_in_bytes,
+8
Parādīt failu
@@ -44,6 +44,10 @@ void ColorConvertYUV400ToRGB(hipStream_t stream, uint32_t dst_width, uint32_t ds
uint8_t *dst_image, uint32_t dst_image_stride_in_bytes,
const uint8_t *src_luma_image, uint32_t src_luma_image_stride_in_bytes);
void ColorConvertRGBAToRGB(hipStream_t stream, uint32_t dst_width, uint32_t dst_height,
uint8_t *dst_image, uint32_t dst_image_stride_in_bytes,
const uint8_t *src_image, uint32_t src_image_stride_in_bytes);
void ConvertInterleavedUVToPlanarUV(hipStream_t stream, uint32_t dst_width, uint32_t dst_height,
uint8_t *dst_image1, uint8_t *dst_image2, uint32_t dst_image_stride_in_bytes,
const uint8_t *src_image1, uint32_t src_image1_stride_in_bytes);
@@ -59,4 +63,8 @@ typedef struct UINT6TYPE {
uint data[6];
} DUINT6;
typedef struct UINT8TYPE {
uint data[8];
} DUINT8;
#endif //ROC_JPEG_HIP_KERNELS_H_
+54 -21
Parādīt failu
@@ -25,7 +25,19 @@ THE SOFTWARE.
RocJpegVappiDecoder::RocJpegVappiDecoder(int device_id) : device_id_{device_id}, drm_fd_{-1}, min_picture_width_{64}, min_picture_height_{64},
max_picture_width_{4096}, max_picture_height_{4096}, va_display_{0}, va_config_attrib_{{}}, va_config_id_{0}, va_profile_{VAProfileJPEGBaseline},
va_context_id_{0}, va_surface_ids_{}, va_picture_parameter_buf_id_{0}, va_quantization_matrix_buf_id_{0}, va_huffmantable_buf_id_{0},
va_slice_param_buf_id_{0}, va_slice_data_buf_id_{0} {};
va_slice_param_buf_id_{0}, va_slice_data_buf_id_{0}, current_vcn_jpeg_spec_{0} {
vcn_jpeg_spec_ = {{"gfx908", {2, false, false}},
{"gfx90a", {2, false, false}},
{"gfx940", {24, true, true}},
{"gfx941", {32, true, true}},
{"gfx942", {32, true, true}},
{"gfx1030", {2, false, false}},
{"gfx1031", {2, false, false}},
{"gfx1032", {2, false, false}},
{"gfx1100", {2, false, false}},
{"gfx1101", {1, false, false}},
{"gfx1102", {2, false, false}}};
};
RocJpegVappiDecoder::~RocJpegVappiDecoder() {
if (drm_fd_ != -1) {
@@ -68,6 +80,10 @@ RocJpegStatus RocJpegVappiDecoder::InitializeDecoder(std::string device_name, st
std::size_t pos = gcn_arch_name.find_first_of(":");
std::string gcn_arch_name_base = (pos != std::string::npos) ? gcn_arch_name.substr(0, pos) : gcn_arch_name;
auto it = vcn_jpeg_spec_.find(gcn_arch_name_base);
if (it != vcn_jpeg_spec_.end()) {
current_vcn_jpeg_spec_ = it->second;
}
std::vector<int> visible_devices;
GetVisibleDevices(visible_devices);
@@ -179,7 +195,7 @@ RocJpegStatus RocJpegVappiDecoder::DestroyDataBuffers() {
return ROCJPEG_STATUS_SUCCESS;
}
RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id) {
RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id, RocJpegOutputFormat output_format) {
if (jpeg_stream_params == nullptr) {
return ROCJPEG_STATUS_INVALID_PARAMETER;
}
@@ -199,28 +215,45 @@ RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg
return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED;
}
uint8_t surface_format;
switch (jpeg_stream_params->chroma_subsampling) {
case CSS_444:
surface_format = VA_RT_FORMAT_YUV444;
break;
case CSS_422:
surface_format = VA_RT_FORMAT_YUV422;
break;
case CSS_420:
surface_format = VA_RT_FORMAT_YUV420;
break;
case CSS_400:
surface_format = VA_RT_FORMAT_YUV400;
break;
default:
ERR("ERROR: The chroma subsampling is not supported by the VCN hardware!");
return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED;
break;
uint32_t surface_format;
VASurfaceAttrib surface_attrib;
surface_attrib.type = VASurfaceAttribPixelFormat;
surface_attrib.flags = VA_SURFACE_ATTRIB_SETTABLE;
surface_attrib.value.type = VAGenericValueTypeInteger;
// If RGB output format is requested, and the HW JPEG decoder has a built-in format conversion,
// set the RGB surface format and attributes to obtain the RGB output directly from the JPEG HW decoder.
// otherwise set the appropriate surface format and attributes based on the chroma subsampling of the image.
if (output_format == ROCJPEG_OUTPUT_RGB && current_vcn_jpeg_spec_.can_convert_to_rgb) {
surface_format = VA_RT_FORMAT_RGB32;
surface_attrib.value.value.i = VA_FOURCC_RGBA;
} else {
switch (jpeg_stream_params->chroma_subsampling) {
case CSS_444:
surface_format = VA_RT_FORMAT_YUV444;
surface_attrib.value.value.i = VA_FOURCC_444P;
break;
case CSS_422:
surface_format = VA_RT_FORMAT_YUV422;
surface_attrib.value.value.i = ROCJPEG_FOURCC_YUYV;
break;
case CSS_420:
surface_format = VA_RT_FORMAT_YUV420;
surface_attrib.value.value.i = VA_FOURCC_NV12;
break;
case CSS_400:
surface_format = VA_RT_FORMAT_YUV400;
surface_attrib.value.value.i = VA_FOURCC_Y800;
break;
default:
ERR("ERROR: The chroma subsampling is not supported by the VCN hardware!");
return ROCJPEG_STATUS_JPEG_NOT_SUPPORTED;
break;
}
}
VASurfaceID va_surface_id;
CHECK_VAAPI(vaCreateSurfaces(va_display_, surface_format, jpeg_stream_params->picture_parameter_buffer.picture_width, jpeg_stream_params->picture_parameter_buffer.picture_height, &va_surface_id, 1, nullptr, 1));
CHECK_VAAPI(vaCreateSurfaces(va_display_, surface_format, jpeg_stream_params->picture_parameter_buffer.picture_width, jpeg_stream_params->picture_parameter_buffer.picture_height, &va_surface_id, 1, &surface_attrib, 1));
va_surface_ids_.push_back(va_surface_id);
surface_id = va_surface_id;
+10 -1
Parādīt failu
@@ -32,6 +32,7 @@ THE SOFTWARE.
#include <fcntl.h>
#include <unistd.h>
#include <filesystem>
#include <unordered_map>
#include <va/va.h>
#include <va/va_drm.h>
#include <va/va_drmcommon.h>
@@ -50,12 +51,18 @@ typedef enum {
kCpx = 4, // Core Partition Accelerator
} ComputePartition;
typedef struct {
uint32_t num_jpeg_cores;
bool can_convert_to_rgb;
bool can_roi_decode;
} VcnJpegSpec;
class RocJpegVappiDecoder {
public:
RocJpegVappiDecoder(int device_id = 0);
~RocJpegVappiDecoder();
RocJpegStatus InitializeDecoder(std::string device_name, std::string gcn_arch_name, int device_id);
RocJpegStatus SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id);
RocJpegStatus SubmitDecode(const JpegStreamParameters *jpeg_stream_params, uint32_t &surface_id, RocJpegOutputFormat output_format);
RocJpegStatus ExportSurface(VASurfaceID surface_id, VADRMPRIMESurfaceDescriptor &va_drm_prime_surface_desc);
RocJpegStatus SyncSurface(VASurfaceID surface_id);
RocJpegStatus ReleaseSurface(VASurfaceID surface_id);
@@ -72,6 +79,8 @@ private:
VAProfile va_profile_;
VAContextID va_context_id_;
std::vector<VASurfaceID> va_surface_ids_;
std::unordered_map<std::string, VcnJpegSpec> vcn_jpeg_spec_;
VcnJpegSpec current_vcn_jpeg_spec_;
VABufferID va_picture_parameter_buf_id_;
VABufferID va_quantization_matrix_buf_id_;
VABufferID va_huffmantable_buf_id_;