From 7b321a86edcaf1f2cf4819e79ef497cc4f4de5e3 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Thu, 22 Aug 2024 17:04:03 -0400 Subject: [PATCH] Add support for ROI decode on MI300 (#53) * Add support for ROI decode on MI300 * Add ROI decode support for batched decoding * clean up * clean up * Fix a typo and update sample's readme --- samples/jpegDecode/README.md | 1 + samples/jpegDecodeBatched/README.md | 1 + samples/jpegDecodeMultiThreads/README.md | 1 + samples/rocjpeg_samples_utils.h | 2 +- src/rocjpeg_decoder.cpp | 21 +++++++++--- src/rocjpeg_vaapi_decoder.cpp | 43 ++++++++++++++++++++++-- 6 files changed, 62 insertions(+), 7 deletions(-) diff --git a/samples/jpegDecode/README.md b/samples/jpegDecode/README.md index 3b38e0d173..af27e6c76b 100644 --- a/samples/jpegDecode/README.md +++ b/samples/jpegDecode/README.md @@ -22,5 +22,6 @@ make -j 1 for hybrid JPEG decoding using CPU and GPU HIP kernels (currently not supported)) [optional - default: 0]> -fmt <[output format] - select rocJPEG output format for decoding, one of the [native, yuv, y, rgb, rgb_planar] [optional - default: native]> -o <[output path] - path to an output file or a path to a directory - write decoded images to a file or directory based on selected output format [optional]> + -crop <[crop rectangle] - crop rectangle for output in a comma-separated format: left,top,right,bottom - [optional]> -d <[device id] - specify the GPU device id for the desired device (use 0 for the first device, 1 for the second device, and so on); [optional - default: 0]> ``` \ No newline at end of file diff --git a/samples/jpegDecodeBatched/README.md b/samples/jpegDecodeBatched/README.md index 29483a1474..9845724dbe 100644 --- a/samples/jpegDecodeBatched/README.md +++ b/samples/jpegDecodeBatched/README.md @@ -23,5 +23,6 @@ make -j -fmt <[output format] - select rocJPEG output format for decoding, one of the [native, yuv, y, rgb, rgb_planar] [optional - default: native]> -o <[output path] - path to an output file or a path to a directory - write decoded images to a file or directory based on selected output format [optional]> -d <[device id] - specify the GPU device id for the desired device (use 0 for the first device, 1 for the second device, and so on) - [optional - default: 0]> + -crop <[crop rectangle] - crop rectangle for output in a comma-separated format: left,top,right,bottom - [optional]> -b <[batch_size] - decode images from input by batches of a specified size - [optional - default: 2]> ``` \ No newline at end of file diff --git a/samples/jpegDecodeMultiThreads/README.md b/samples/jpegDecodeMultiThreads/README.md index 806c2ee03d..be082cb187 100644 --- a/samples/jpegDecodeMultiThreads/README.md +++ b/samples/jpegDecodeMultiThreads/README.md @@ -23,5 +23,6 @@ make -j -fmt <[output format] - select rocJPEG output format for decoding, one of the [native, yuv, y, rgb, rgb_planar] [optional - default: native]> -o <[output path] - path to an output file or a path to a directory - write decoded images to a file or directory based on selected output format [optional]> -d <[device id] - specify the GPU device id for the desired device (use 0 for the first device, 1 for the second device, and so on) [optional - default: 0]> + -crop <[crop rectangle] - crop rectangle for output in a comma-separated format: left,top,right,bottom - [optional]> -t <[threads] - number of threads for parallel JPEG decoding [optional - default: 2]> ``` \ No newline at end of file diff --git a/samples/rocjpeg_samples_utils.h b/samples/rocjpeg_samples_utils.h index b7cfadb101..ab474beb1c 100644 --- a/samples/rocjpeg_samples_utils.h +++ b/samples/rocjpeg_samples_utils.h @@ -635,7 +635,7 @@ private: " 1 for hybrid JPEG decoding using CPU and GPU HIP kernels (currently not supported)) [optional - default: 0]\n" "-fmt [output format] - select rocJPEG output format for decoding, one of the [native, yuv, y, rgb, rgb_planar] - [optional - default: native]\n" "-o [output path] - path to an output file or a path to an existing directory - write decoded images to a file or an existing directory based on selected output format - [optional]\n" - "-crop -crop [crop rectangle] - crop rectangle for output in a comma-separated format: left,top,right,bottom - [optional]\n" + "-crop [crop rectangle] - crop rectangle for output in a comma-separated format: left,top,right,bottom - [optional]\n" "-d [device id] - specify the GPU device id for the desired device (use 0 for the first device, 1 for the second device, and so on) [optional - default: 0]\n"; if (show_threads) { std::cout << "-t [threads] - number of threads for parallel JPEG decoding - [optional - default: 2]\n"; diff --git a/src/rocjpeg_decoder.cpp b/src/rocjpeg_decoder.cpp index 4681256481..a228884c6f 100644 --- a/src/rocjpeg_decoder.cpp +++ b/src/rocjpeg_decoder.cpp @@ -125,14 +125,21 @@ RocJpegStatus RocJpegDecoder::Decode(RocJpegStreamHandle jpeg_stream_handle, con uint32_t roi_height; roi_width = decode_params->crop_rectangle.right - decode_params->crop_rectangle.left; roi_height = decode_params->crop_rectangle.bottom - decode_params->crop_rectangle.top; - + if (roi_width > 0 && roi_height > 0 && roi_width <= jpeg_stream_params->picture_parameter_buffer.picture_width && roi_height <= jpeg_stream_params->picture_parameter_buffer.picture_height) { - is_roi_valid = true; + is_roi_valid = true; } picture_width = is_roi_valid ? roi_width : jpeg_stream_params->picture_parameter_buffer.picture_width; picture_height = is_roi_valid ? roi_height : jpeg_stream_params->picture_parameter_buffer.picture_height; - + + VcnJpegSpec current_vcn_jpeg_spec = jpeg_vaapi_decoder_.GetCurrentVcnJpegSpec(); + if (is_roi_valid && current_vcn_jpeg_spec.can_roi_decode) { + // Set is_roi_valid to false because in this case, the hardware handles the ROI decode and we don't + // need to calculate the roi_offset later in the following functions (e.g., CopyChannel, GetPlanarYUVOutputFormat, etc) to copy the crop rectangle + is_roi_valid = false; + } + switch (decode_params->output_format) { case ROCJPEG_OUTPUT_NATIVE: // Copy the native decoded output buffers from interop memory directly to the destination buffers @@ -227,12 +234,18 @@ RocJpegStatus RocJpegDecoder::DecodeBatched(RocJpegStreamHandle *jpeg_streams, i roi_height = decode_params->crop_rectangle.bottom - decode_params->crop_rectangle.top; if (roi_width > 0 && roi_height > 0 && roi_width <= jpeg_stream_params->picture_parameter_buffer.picture_width && roi_height <= jpeg_stream_params->picture_parameter_buffer.picture_height) { - is_roi_valid = true; + is_roi_valid = true; } picture_width = is_roi_valid ? roi_width : jpeg_stream_params->picture_parameter_buffer.picture_width; picture_height = is_roi_valid ? roi_height : jpeg_stream_params->picture_parameter_buffer.picture_height; + if (is_roi_valid && current_vcn_jpeg_spec.can_roi_decode) { + // Set is_roi_valid to false because in this case, the hardware handles the ROI decode and we don't need to calculate the roi_offset + // later in the following functions (e.g., CopyChannel, GetPlanarYUVOutputFormat, etc) to copy the crop rectangle + is_roi_valid = false; + } + switch (decode_params->output_format) { case ROCJPEG_OUTPUT_NATIVE: // Copy the native decoded output buffers from interop memory directly to the destination buffers diff --git a/src/rocjpeg_vaapi_decoder.cpp b/src/rocjpeg_vaapi_decoder.cpp index 4f791f83f8..5b6a163763 100644 --- a/src/rocjpeg_vaapi_decoder.cpp +++ b/src/rocjpeg_vaapi_decoder.cpp @@ -571,6 +571,26 @@ RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg } } + // if the HW JPEG decoder has a built-in ROI-decode capability then fill the requested crop rectangle to the picture parameter buffer + void *picture_parameter_buffer = (void*)&jpeg_stream_params->picture_parameter_buffer; + if (current_vcn_jpeg_spec_.can_roi_decode) { + uint32_t roi_width; + uint32_t roi_height; + roi_width = decode_params->crop_rectangle.right - decode_params->crop_rectangle.left; + roi_height = decode_params->crop_rectangle.bottom - decode_params->crop_rectangle.top; + if (roi_width > 0 && roi_height > 0 && roi_width <= jpeg_stream_params->picture_parameter_buffer.picture_width && roi_height <= jpeg_stream_params->picture_parameter_buffer.picture_height) { +#if VA_CHECK_VERSION(1, 21, 0) + reinterpret_cast(picture_parameter_buffer)->crop_rectangle.x = decode_params->crop_rectangle.left; + reinterpret_cast(picture_parameter_buffer)->crop_rectangle.y = decode_params->crop_rectangle.top; + reinterpret_cast(picture_parameter_buffer)->crop_rectangle.width = roi_width; + reinterpret_cast(picture_parameter_buffer)->crop_rectangle.height = roi_height; +#else + reinterpret_cast(picture_parameter_buffer)->va_reserved[0] = decode_params->crop_rectangle.top << 16 | decode_params->crop_rectangle.left; + reinterpret_cast(picture_parameter_buffer)->va_reserved[1] = roi_height << 16 | roi_width; +#endif + } + } + uint32_t surface_pixel_format = static_cast(surface_attrib.value.value.i); RocJpegVaapiMemPoolEntry mem_pool_entry = vaapi_mem_pool_->GetEntry(surface_pixel_format, jpeg_stream_params->picture_parameter_buffer.picture_width, jpeg_stream_params->picture_parameter_buffer.picture_height, 1); VAContextID va_context_id; @@ -592,7 +612,7 @@ RocJpegStatus RocJpegVappiDecoder::SubmitDecode(const JpegStreamParameters *jpeg CHECK_ROCJPEG(DestroyDataBuffers()); - CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VAPictureParameterBufferType, sizeof(VAPictureParameterBufferJPEGBaseline), 1, (void *)&jpeg_stream_params->picture_parameter_buffer, &va_picture_parameter_buf_id_)); + CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VAPictureParameterBufferType, sizeof(VAPictureParameterBufferJPEGBaseline), 1, picture_parameter_buffer, &va_picture_parameter_buf_id_)); CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VAIQMatrixBufferType, sizeof(VAIQMatrixBufferJPEGBaseline), 1, (void *)&jpeg_stream_params->quantization_matrix_buffer, &va_quantization_matrix_buf_id_)); CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VAHuffmanTableBufferType, sizeof(VAHuffmanTableBufferJPEGBaseline), 1, (void *)&jpeg_stream_params->huffman_table_buffer, &va_huffmantable_buf_id_)); CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VASliceParameterBufferType, sizeof(VASliceParameterBufferJPEGBaseline), 1, (void *)&jpeg_stream_params->slice_parameter_buffer, &va_slice_param_buf_id_)); @@ -680,6 +700,10 @@ RocJpegStatus RocJpegVappiDecoder::SubmitDecodeBatched(JpegStreamParameters *jpe surface_attrib.type = VASurfaceAttribPixelFormat; surface_attrib.flags = VA_SURFACE_ATTRIB_SETTABLE; surface_attrib.value.type = VAGenericValueTypeInteger; + uint32_t roi_width; + uint32_t roi_height; + roi_width = decode_params->crop_rectangle.right - decode_params->crop_rectangle.left; + roi_height = decode_params->crop_rectangle.bottom - decode_params->crop_rectangle.top; // Iterate through all entries of jpeg_stream_groups. // Check if there is a matching entry in the memory pool. @@ -715,8 +739,23 @@ RocJpegStatus RocJpegVappiDecoder::SubmitDecodeBatched(JpegStreamParameters *jpe } for (int idx : indices) { + // if the HW JPEG decoder has a built-in ROI-decode capability then fill the requested crop rectangle to the picture parameter buffer + void* picture_parameter_buffer = &jpeg_streams_params[idx].picture_parameter_buffer; + if (current_vcn_jpeg_spec_.can_roi_decode && roi_width > 0 && roi_height > 0 && + roi_width <= jpeg_streams_params[idx].picture_parameter_buffer.picture_width && + roi_height <= jpeg_streams_params[idx].picture_parameter_buffer.picture_height) { +#if VA_CHECK_VERSION(1, 21, 0) + reinterpret_cast(picture_parameter_buffer)->crop_rectangle.x = decode_params->crop_rectangle.left; + reinterpret_cast(picture_parameter_buffer)->crop_rectangle.y = decode_params->crop_rectangle.top; + reinterpret_cast(picture_parameter_buffer)->crop_rectangle.width = roi_width; + reinterpret_cast(picture_parameter_buffer)->crop_rectangle.height = roi_height; +#else + reinterpret_cast(picture_parameter_buffer)->va_reserved[0] = decode_params->crop_rectangle.top << 16 | decode_params->crop_rectangle.left; + reinterpret_cast(picture_parameter_buffer)->va_reserved[1] = roi_height << 16 | roi_width; +#endif + } CHECK_ROCJPEG(DestroyDataBuffers()); - CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VAPictureParameterBufferType, sizeof(VAPictureParameterBufferJPEGBaseline), 1, (void *)&jpeg_streams_params[idx].picture_parameter_buffer, &va_picture_parameter_buf_id_)); + CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VAPictureParameterBufferType, sizeof(VAPictureParameterBufferJPEGBaseline), 1, picture_parameter_buffer, &va_picture_parameter_buf_id_)); CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VAIQMatrixBufferType, sizeof(VAIQMatrixBufferJPEGBaseline), 1, (void *)&jpeg_streams_params[idx].quantization_matrix_buffer, &va_quantization_matrix_buf_id_)); CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VAHuffmanTableBufferType, sizeof(VAHuffmanTableBufferJPEGBaseline), 1, (void *)&jpeg_streams_params[idx].huffman_table_buffer, &va_huffmantable_buf_id_)); CHECK_VAAPI(vaCreateBuffer(va_display_, va_context_id, VASliceParameterBufferType, sizeof(VASliceParameterBufferJPEGBaseline), 1, (void *)&jpeg_streams_params[idx].slice_parameter_buffer, &va_slice_param_buf_id_));