From 39e274d02d5af4b22be278b8684742fe59a9688d Mon Sep 17 00:00:00 2001 From: Rajy Rawther Date: Mon, 18 Mar 2024 05:32:19 -0700 Subject: [PATCH] rocDecode resize sample implementation (#285) * WIP: resize kernels * hipified kernels for scaling from cuda * updated videodecodeRGB sample for scaling * added stream parameter to kernels * add scale kernels using tex2d and NN * enable tex2D kernels * add NN resize kernels * fixed scaling kernels * fixed tex2D scaling kernel for UV scaling * minor formatting * address review comments --------- Co-authored-by: Aryan Salmanpour --- samples/videoDecodeRGB/CMakeLists.txt | 4 +- samples/videoDecodeRGB/videodecrgb.cpp | 87 +++++-- utils/resize_kernels.cpp | 328 +++++++++++++++++++++++++ utils/resize_kernels.h | 103 ++++++++ utils/rocvideodecode/roc_video_dec.h | 4 + 5 files changed, 509 insertions(+), 17 deletions(-) create mode 100644 utils/resize_kernels.cpp create mode 100644 utils/resize_kernels.h diff --git a/samples/videoDecodeRGB/CMakeLists.txt b/samples/videoDecodeRGB/CMakeLists.txt index 27da19fb51..3a7c5f1d9d 100644 --- a/samples/videoDecodeRGB/CMakeLists.txt +++ b/samples/videoDecodeRGB/CMakeLists.txt @@ -87,7 +87,9 @@ if(HIP_FOUND AND FFMPEG_FOUND AND ROCDECODE_FOUND) list(APPEND SOURCES ${PROJECT_SOURCE_DIR} videodecrgb.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/rocvideodecode/roc_video_dec.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/colorspace_kernels.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/colorspace_kernels.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/resize_kernels.cpp) + add_executable(${PROJECT_NAME} ${SOURCES}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") target_link_libraries(${PROJECT_NAME} ${LINK_LIBRARY_LIST}) diff --git a/samples/videoDecodeRGB/videodecrgb.cpp b/samples/videoDecodeRGB/videodecrgb.cpp index 2efc64ac4c..c07437cd0a 100644 --- a/samples/videoDecodeRGB/videodecrgb.cpp +++ b/samples/videoDecodeRGB/videodecrgb.cpp @@ -40,6 +40,7 @@ THE SOFTWARE. #include "video_demuxer.h" #include "roc_video_dec.h" #include "colorspace_kernels.h" +#include "resize_kernels.h" FILE *fpOut = nullptr; enum OutputFormatEnum { @@ -158,34 +159,70 @@ std::queue frame_queue[frame_buffers_size]; std::mutex mutex[frame_buffers_size]; std::condition_variable cv[frame_buffers_size]; -void ColorSpaceConversionThread(std::atomic& continue_processing, bool convert_to_rgb, OutputSurfaceInfo **surf_info, OutputFormatEnum e_output_format, - uint8_t *p_rgb_dev_mem, bool dump_output_frames, std::string &output_file_path, RocVideoDecoder &viddec) { - size_t rgb_image_size; +void ColorSpaceConversionThread(std::atomic& continue_processing, bool convert_to_rgb, Dim *p_resize_dim, OutputSurfaceInfo **surf_info, OutputSurfaceInfo **res_surf_info, + OutputFormatEnum e_output_format, uint8_t *p_rgb_dev_mem, uint8_t *p_resize_dev_mem, bool dump_output_frames, std::string &output_file_path, RocVideoDecoder &viddec) { + + size_t rgb_image_size, resize_image_size; hipError_t hip_status = hipSuccess; int current_frame_index = 0; uint8_t *frame; while (continue_processing || !frame_queue[current_frame_index].empty()) { + OutputSurfaceInfo *p_surf_info; + uint8_t *out_frame; { std::unique_lock lock(mutex[current_frame_index]); cv[current_frame_index].wait(lock, [&] {return !frame_queue[current_frame_index].empty() || !continue_processing;}); if (!continue_processing && frame_queue[current_frame_index].empty()) { break; } + p_surf_info = *surf_info; // Get the current frame at the curren_buffer index for processing frame = frame_queue[current_frame_index].front(); frame_queue[current_frame_index].pop(); + out_frame = frame; + } + if (p_resize_dim->w && p_resize_dim->h && *res_surf_info) { + // check if the resize dims are different from output dims + // resize is needed since output dims are different from resize dims + // TODO:: the below code assumes NV12/P016 for decoded output surface. Modify to take other surface formats in future + if (((*surf_info)->output_width != p_resize_dim->w) || ((*surf_info)->output_height != p_resize_dim->h)) { + resize_image_size = p_resize_dim->w * (p_resize_dim->h + (p_resize_dim->h >> 1)) * (*surf_info)->bytes_per_pixel; + if (p_resize_dev_mem == nullptr && resize_image_size > 0) { + hip_status = hipMalloc(&p_resize_dev_mem, resize_image_size); + if (hip_status != hipSuccess) { + std::cerr << "ERROR: hipMalloc failed to allocate the device memory for the output!" << hip_status << std::endl; + return; + } + } + // call resize kernel + if ((*surf_info)->bytes_per_pixel == 2) { + ResizeP016(p_resize_dev_mem, p_resize_dim->w * 2, p_resize_dim->w, p_resize_dim->h, frame, (*surf_info)->output_pitch, (*surf_info)->output_width, + (*surf_info)->output_height, (frame + (*surf_info)->output_vstride * (*surf_info)->output_pitch), nullptr, viddec.GetStream()); + } else { + ResizeNv12(p_resize_dev_mem, p_resize_dim->w, p_resize_dim->w, p_resize_dim->h, frame, (*surf_info)->output_pitch, (*surf_info)->output_width, + (*surf_info)->output_height, (frame + (*surf_info)->output_vstride * (*surf_info)->output_pitch), nullptr, viddec.GetStream()); + } + (*res_surf_info)->output_width = p_resize_dim->w; + (*res_surf_info)->output_height = p_resize_dim->h; + (*res_surf_info)->output_pitch = p_resize_dim->w * (*surf_info)->bytes_per_pixel; + (*res_surf_info)->output_vstride = p_resize_dim->h; + (*res_surf_info)->output_surface_size_in_bytes = (*res_surf_info)->output_pitch * (p_resize_dim->h + (p_resize_dim->h >> 1)); + (*res_surf_info)->mem_type = OUT_SURFACE_MEM_DEV_COPIED; + p_surf_info = *res_surf_info; + out_frame = p_resize_dev_mem; + } } if (convert_to_rgb) { int rgb_width; - if ((*surf_info)->bit_depth == 8) { - rgb_width = ((*surf_info)->output_width + 1) & ~1; // has to be a multiple of 2 for hip colorconvert kernels - rgb_image_size = ((e_output_format == bgr) || (e_output_format == rgb)) ? rgb_width * (*surf_info)->output_height * 3 : rgb_width * (*surf_info)->output_height * 4; + if (p_surf_info->bit_depth == 8) { + rgb_width = (p_surf_info->output_width + 1) & ~1; // has to be a multiple of 2 for hip colorconvert kernels + rgb_image_size = ((e_output_format == bgr) || (e_output_format == rgb)) ? rgb_width * p_surf_info->output_height * 3 : rgb_width * p_surf_info->output_height * 4; } else { - rgb_width = ((*surf_info)->output_width + 1) & ~1; - rgb_image_size = ((e_output_format == bgr) || (e_output_format == rgb)) ? rgb_width * (*surf_info)->output_height * 3 : ((e_output_format == bgr48) || (e_output_format == rgb48)) ? - rgb_width * (*surf_info)->output_height * 6 : rgb_width * (*surf_info)->output_height * 8; + rgb_width = (p_surf_info->output_width + 1) & ~1; + rgb_image_size = ((e_output_format == bgr) || (e_output_format == rgb)) ? rgb_width * p_surf_info->output_height * 3 : ((e_output_format == bgr48) || (e_output_format == rgb48)) ? + rgb_width * p_surf_info->output_height * 6 : rgb_width * p_surf_info->output_height * 8; } if (p_rgb_dev_mem == nullptr) { hip_status = hipMalloc(&p_rgb_dev_mem, rgb_image_size); @@ -194,18 +231,17 @@ void ColorSpaceConversionThread(std::atomic& continue_processing, bool con return; } } - ColorConvertYUV2RGB(frame, *surf_info, p_rgb_dev_mem, e_output_format, viddec.GetStream()); + ColorConvertYUV2RGB(out_frame, p_surf_info, p_rgb_dev_mem, e_output_format, viddec.GetStream()); } if (dump_output_frames) { if (convert_to_rgb) - DumpRGBImage(output_file_path, p_rgb_dev_mem, *surf_info, rgb_image_size); + DumpRGBImage(output_file_path, p_rgb_dev_mem, p_surf_info, rgb_image_size); else - viddec.SaveFrameToFile(output_file_path, frame, *surf_info); + viddec.SaveFrameToFile(output_file_path, out_frame, p_surf_info); } cv[current_frame_index].notify_one(); current_frame_index = (current_frame_index + 1) % frame_buffers_size; - } } @@ -216,11 +252,13 @@ int main(int argc, char **argv) { bool convert_to_rgb = false; int device_id = 0; Rect crop_rect = {}; + Dim resize_dim = {}; Rect *p_crop_rect = nullptr; size_t rgb_image_size; uint32_t rgb_image_stride; hipError_t hip_status = hipSuccess; uint8_t *p_rgb_dev_mem = nullptr; + uint8_t *p_resize_dev_mem = nullptr; OutputSurfaceMemoryType mem_type = OUT_SURFACE_MEM_DEV_INTERNAL; OutputFormatEnum e_output_format = native; int rgb_width; @@ -268,6 +306,16 @@ int main(int argc, char **argv) { p_crop_rect = &crop_rect; continue; } + if (!strcmp(argv[i], "-resize")) { + if (++i == argc || 2 != sscanf(argv[i], "%dx%d", &resize_dim.w, &resize_dim.h)) { + ShowHelpAndExit("-resize"); + } + if (resize_dim.w % 2 == 1 || resize_dim.h % 2 == 1) { + std::cout << "Resizing dimensions must have width and height of even numbers" << std::endl; + exit(1); + } + continue; + } if (!strcmp(argv[i], "-of")) { if (++i == argc) { ShowHelpAndExit("-of"); @@ -302,13 +350,13 @@ int main(int argc, char **argv) { uint8_t *p_frame = nullptr; int64_t pts = 0; OutputSurfaceInfo *surf_info; + OutputSurfaceInfo *resize_surf_info = nullptr; uint32_t width, height; double total_dec_time = 0; convert_to_rgb = e_output_format != native; - std::atomic continue_processing(true); - std::thread color_space_conversion_thread(ColorSpaceConversionThread, std::ref(continue_processing), std::ref(convert_to_rgb), &surf_info, std::ref(e_output_format), - std::ref(p_rgb_dev_mem), std::ref(dump_output_frames), std::ref(output_file_path), std::ref(viddec)); + std::thread color_space_conversion_thread(ColorSpaceConversionThread, std::ref(continue_processing), std::ref(convert_to_rgb), &resize_dim, &surf_info, &resize_surf_info, std::ref(e_output_format), + std::ref(p_rgb_dev_mem), std::ref(p_resize_dev_mem), std::ref(dump_output_frames), std::ref(output_file_path), std::ref(viddec)); auto startTime = std::chrono::high_resolution_clock::now(); do { @@ -318,6 +366,10 @@ int main(int argc, char **argv) { std::cerr << "Error: Failed to get Output Image Info!" << std::endl; break; } + if (resize_dim.w && resize_dim.h && !resize_surf_info) { + resize_surf_info = new OutputSurfaceInfo; + memcpy(resize_surf_info, surf_info, sizeof(OutputSurfaceInfo)); + } int last_index = 0; for (int i = 0; i < n_frames_returned; i++) { @@ -386,6 +438,9 @@ int main(int argc, char **argv) { std::cout << info_message << total_dec_time / n_frame << std::endl; std::cout << "info: avg FPS: " << (n_frame / total_dec_time) * 1000 << std::endl; } + if (resize_surf_info != nullptr) { + delete resize_surf_info; + } } catch (const std::exception &ex) { std::cout << ex.what() << std::endl; exit(1); diff --git a/utils/resize_kernels.cpp b/utils/resize_kernels.cpp new file mode 100644 index 0000000000..5bf38df4b3 --- /dev/null +++ b/utils/resize_kernels.cpp @@ -0,0 +1,328 @@ +/* +Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "resize_kernels.h" +#include "roc_video_dec.h" + +#if !defined(__HIP_NO_IMAGE_SUPPORT) || !__HIP_NO_IMAGE_SUPPORT +/** + * @brief low level HIP kernel for Resize using tex2d + * + * @tparam YuvUnitx2 + * @param tex_y - text2D object Y pointer + * @param tex_uv - text2D object UV pointer + * @param p_dst - dst Y pointer + * @param p_dst_uv - dst UV pointer + * @param pitch - dst pitch + * @param width - dst width + * @param height - dst height + * @param fx_scale - xscale + * @param fy_scale - yscale + * @return + */ + +template +static __global__ void ResizeHip(hipTextureObject_t tex_y, hipTextureObject_t tex_uv, + uint8_t *p_dst, uint8_t *p_dst_uv, int pitch, int width, int height, + float fx_scale, float fy_scale) +{ + int ix = blockIdx.x * blockDim.x + threadIdx.x, + iy = blockIdx.y * blockDim.y + threadIdx.y; + + if (ix >= width / 2 || iy >= height / 2) { + return; + } + + int x = ix * 2, y = iy * 2; + typedef decltype(YuvUnitx2::x) YuvUnit; + const int max_yuv_value = (1 << (sizeof(YuvUnit) * 8)) - 1; + *(YuvUnitx2 *)(p_dst + y * pitch + x * sizeof(YuvUnit)) = YuvUnitx2 { + (YuvUnit)(tex2D(tex_y, x * fx_scale, y * fy_scale) * max_yuv_value), + (YuvUnit)(tex2D(tex_y, (x + 1) * fx_scale, y * fy_scale) * max_yuv_value) + }; + y++; + *(YuvUnitx2 *)(p_dst + y * pitch + x * sizeof(YuvUnit)) = YuvUnitx2 { + (YuvUnit)(tex2D(tex_y, x * fx_scale, y * fy_scale) * max_yuv_value), + (YuvUnit)(tex2D(tex_y, (x + 1) * fx_scale, y * fy_scale) * max_yuv_value) + }; + float2 uv = tex2D(tex_uv, ix * fx_scale, iy * fy_scale + 0.5f); + *(YuvUnitx2 *)(p_dst_uv + iy * pitch + ix * 2 * sizeof(YuvUnit)) = YuvUnitx2{ (YuvUnit)(uv.x * max_yuv_value), (YuvUnit)(uv.y * max_yuv_value) }; +} +#endif + +/** + * @brief low level HIP kernel for Resize using nearest neighbor interpolation + * + * @tparam YuvUnitx2 + * @param p_src - src Y pointer + * @param p_src_uv - src UV pointer + * @param src_pitch - src pitch + * @param p_dst - dst Y pointer + * @param p_dst_uv - dst UV pointer + * @param pitch - dst pitch + * @param width - dst width + * @param height - dst height + * @param fx_scale - xscale + * @param fy_scale - yscale + * @return + */ + +template +static __global__ void ResizeHip(uint8_t *p_src, uint8_t *p_src_uv, int src_pitch, + uint8_t *p_dst, uint8_t *p_dst_uv, int pitch, int width, int height, float fx_scale, float fy_scale) { + + int ix = blockIdx.x * blockDim.x + threadIdx.x, + iy = blockIdx.y * blockDim.y + threadIdx.y; + + if (ix >= width / 2 || iy >= height / 2) { + return; + } + + int x = ix * 2, y = iy * 2; + typedef decltype(YuvUnitx2::x) YuvUnit; + uint8_t *p_src_y = p_src + src_pitch * static_cast(fmaf(y, fy_scale, 0.5 * fy_scale)); + *(YuvUnitx2 *)(p_dst + y * pitch + x * sizeof(YuvUnit)) = YuvUnitx2 { + *(YuvUnit *)(p_src_y + static_cast(fmaf(x, fx_scale, 0.5 * fx_scale)) * sizeof(YuvUnit)), + *(YuvUnit *)(p_src_y + static_cast(fmaf(x + 1, fx_scale, 0.5 * fx_scale) * sizeof(YuvUnit))) + }; + y++; + p_src_y = p_src + src_pitch * static_cast(fmaf(y, fy_scale, 0.5 * fy_scale)); + *(YuvUnitx2 *)(p_dst + y * pitch + x * sizeof(YuvUnit)) = YuvUnitx2 { + *(YuvUnit *)(p_src_y + static_cast(fmaf(x, fx_scale, 0.5 * fx_scale)) * sizeof(YuvUnit)), + *(YuvUnit *)(p_src_y + static_cast(fmaf(x + 1, fx_scale, 0.5 * fx_scale)) * sizeof(YuvUnit)) + }; + YuvUnit *p_uv = (YuvUnit *) (p_src_uv + static_cast(fmaf(ix, fx_scale, fx_scale * 0.5)) * sizeof(YuvUnit) * 2 + + src_pitch * static_cast(fmaf(iy, fy_scale, 0.5 * fy_scale))); + *(YuvUnitx2 *)(p_dst_uv + iy * pitch + ix * 2 * sizeof(YuvUnit)) = YuvUnitx2{ (YuvUnit)p_uv[0], (YuvUnit)p_uv[1] }; +} + + +template +static void Resize(unsigned char *p_dst, unsigned char* p_dst_uv, int dst_pitch, int dst_width, int dst_height, + unsigned char *p_src, unsigned char *p_src_uv, int src_pitch, int src_width, int src_height, hipStream_t hip_stream) { +#if !defined(__HIP_NO_IMAGE_SUPPORT) || !__HIP_NO_IMAGE_SUPPORT + hipResourceDesc res_desc = {}; + res_desc.resType = hipResourceTypePitch2D; + res_desc.res.pitch2D.devPtr = p_src; + res_desc.res.pitch2D.desc = hipCreateChannelDesc(); + res_desc.res.pitch2D.width = src_width; + res_desc.res.pitch2D.height = src_height; + res_desc.res.pitch2D.pitchInBytes = src_pitch; + + hipTextureDesc tex_desc = {}; + tex_desc.filterMode = hipFilterModeLinear; + tex_desc.readMode = hipReadModeNormalizedFloat; + + hipTextureObject_t tex_y=0; + HIP_API_CALL(hipCreateTextureObject(&tex_y, &res_desc, &tex_desc, NULL)); + + res_desc.res.pitch2D.devPtr = p_src_uv; + res_desc.res.pitch2D.desc = hipCreateChannelDesc(); + res_desc.res.pitch2D.width = src_width >> 1; + res_desc.res.pitch2D.height = src_height / 2; + + hipTextureObject_t tex_uv=0; + HIP_API_CALL(hipCreateTextureObject(&tex_uv, &res_desc, &tex_desc, NULL)); + + ResizeHip <<>>(tex_y, tex_uv, p_dst, p_dst_uv, + dst_pitch, dst_width, dst_height, 1.0f * src_width / dst_width, 1.0f * src_height / dst_height); + + HIP_API_CALL(hipDestroyTextureObject(tex_y)); + HIP_API_CALL(hipDestroyTextureObject(tex_uv)); +#else + ResizeHip <<>>(p_src, p_src_uv, src_pitch, p_dst, p_dst_uv, + dst_pitch, dst_width, dst_height, 1.0f * src_width / dst_width, 1.0f * src_height / dst_height); +#endif +} + +void ResizeNv12(unsigned char *p_dst_nv12, int dst_pitch, int dst_width, int dst_height, unsigned char *p_src_nv12, + int src_pitch, int src_width, int src_height, unsigned char* p_src_nv12_uv, unsigned char* p_dst_nv12_uv, hipStream_t hip_stream) +{ + unsigned char* p_src_uv = p_src_nv12_uv ? p_src_nv12_uv : p_src_nv12 + (src_pitch*src_height); + unsigned char* p_dst_uv = p_dst_nv12_uv ? p_dst_nv12_uv : p_dst_nv12 + (dst_pitch*dst_height); + return Resize(p_dst_nv12, p_dst_uv, dst_pitch, dst_width, dst_height, p_src_nv12, p_src_uv, src_pitch, src_width, src_height, hip_stream); +} + + +void ResizeP016(unsigned char *p_dst_p016, int dst_pitch, int dst_width, int dst_height, unsigned char *p_src_p016, + int src_pitch, int src_width, int src_height, unsigned char* p_src_p016_uv, unsigned char* p_dst_p016_uv, hipStream_t hip_stream) +{ + unsigned char* p_src_uv = p_src_p016_uv ? p_src_p016_uv : p_src_p016 + (src_pitch*src_height); + unsigned char* p_dst_uv = p_dst_p016_uv ? p_dst_p016_uv : p_dst_p016 + (dst_pitch*dst_height); + return Resize(p_dst_p016, p_dst_uv, dst_pitch, dst_width, dst_height, p_src_p016, p_src_uv, src_pitch, src_width, src_height, hip_stream); +} + +#if !defined(__HIP_NO_IMAGE_SUPPORT) || !__HIP_NO_IMAGE_SUPPORT +static __global__ void Scale_tex2D(hipTextureObject_t tex_src, uint8_t *p_dst, int pitch, int width, + int height, float fx_scale, float fy_scale) { + int x = blockIdx.x * blockDim.x + threadIdx.x, + y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) { + return; + } + + *(unsigned char*)(p_dst + (y * pitch) + x) = (unsigned char)(fminf((tex2D(tex_src, x * fx_scale, y * fy_scale)) * 255.0f, 255.0f)); +} + +static __global__ void Scale_UV_tex2D(hipTextureObject_t tex_src, uint8_t *p_dst, int pitch, int width, + int height, float fx_scale, float fy_scale) { + int x = blockIdx.x * blockDim.x + threadIdx.x, + y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height){ + return; + } + float2 uv = tex2D(tex_src, x * fx_scale, y * fy_scale); + uchar2 dst_uv = uchar2{ (unsigned char)(fminf(uv.x * 255.0f, 255.0f)), (unsigned char)(fminf(uv.y * 255.0f, 255.0f)) }; + + *(uchar2*)(p_dst + (y * pitch) + 2 * x) = dst_uv; +} +#endif + +static __global__ void Scale(uint8_t *p_src, int src_pitch, uint8_t *p_dst, int pitch, int width, + int height, float fx_scale, float fy_scale) { + int x = blockIdx.x * blockDim.x + threadIdx.x, + y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height){ + return; + } + + // do nearest neighbor interpolation + uint8_t *p_src_xy = p_src + src_pitch * static_cast(fmaf(y, fy_scale, 0.5 * fy_scale)) + static_cast(fmaf(x, fx_scale, 0.5*fx_scale)); + *(uint8_t*)(p_dst + (y * pitch) + x) = *p_src_xy; +} + +static __global__ void Scale_UV(uint8_t *p_src, int src_pitch, uint8_t *p_dst, int pitch, int width, + int height, float fx_scale, float fy_scale) { + int x = blockIdx.x * blockDim.x + threadIdx.x, + y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) { + return; + } + // do nearest neighbor interpolation + uint8_t *p_src_uv = p_src + src_pitch * static_cast(fmaf(y , fy_scale, 0.5 * fy_scale)) + static_cast(fmaf(x, fx_scale, 0.5 * fx_scale)) * 2; + uchar2 dst_uv = uchar2{ p_src_uv[0], p_src_uv[1] }; + *(uchar2*)(p_dst + (y * pitch) + 2 * x) = dst_uv; +} + +/** + * @brief Resize a single plane of Y/U/V or UV interleaved (reserved for future) + * + * @param dp_dst - dest pointer + * @param dst_pitch - Pitch of the dst plane + * @param dst_width - Width of the dst plane + * @param dst_height - Height of the dst plane + * @param dp_src - source pointer + * @param src_pitch - source pitch + * @param src_width - source width + * @param src_height - source height + * @param b_resize_uv - to resize UV plance or not + * @param hip_stream - Stream for launching the kernel + */ +void ResizeYUVHipLaunchKernel(uint8_t *dp_dst, int dst_pitch, int dst_width, int dst_height, uint8_t *dp_src, int src_pitch, + int src_width, int src_height, bool b_resize_uv, hipStream_t hip_stream) { + +#if !defined(__HIP_NO_IMAGE_SUPPORT) || !__HIP_NO_IMAGE_SUPPORT + hipResourceDesc res_desc = {}; + res_desc.resType = hipResourceTypePitch2D; + res_desc.res.pitch2D.devPtr = dp_src; + res_desc.res.pitch2D.desc = b_resize_uv ? hipCreateChannelDesc() : hipCreateChannelDesc(); + res_desc.res.pitch2D.width = src_width; + res_desc.res.pitch2D.height = src_height; + res_desc.res.pitch2D.pitchInBytes = src_pitch; + + hipTextureDesc tex_desc = {}; + tex_desc.filterMode = hipFilterModeLinear; + tex_desc.readMode = hipReadModeNormalizedFloat; + + tex_desc.addressMode[0] = hipAddressModeClamp; + tex_desc.addressMode[1] = hipAddressModeClamp; + tex_desc.addressMode[2] = hipAddressModeClamp; + + hipTextureObject_t tex_src = 0; + HIP_API_CALL(hipCreateTextureObject(&tex_src, &res_desc, &tex_desc, NULL)); + + dim3 blockSize(16, 16, 1); + dim3 gridSize(((uint32_t)dst_width + blockSize.x - 1) / blockSize.x, ((uint32_t)dst_height + blockSize.y - 1) / blockSize.y, 1); + + if (b_resize_uv){ + Scale_UV_tex2D <<>>(tex_src, dp_dst, + dst_pitch, dst_width, dst_height, 1.0f * src_width / dst_width, 1.0f * src_height / dst_height); + } + else{ + Scale_tex2D <<>>(tex_src, dp_dst, + dst_pitch, dst_width, dst_height, 1.0f * src_width / dst_width, 1.0f * src_height / dst_height); + } + + HIP_API_CALL(hipGetLastError()); + HIP_API_CALL(hipDestroyTextureObject(tex_src)); +#else + dim3 blockSize(16, 16, 1); + dim3 gridSize(((uint32_t)dst_width + blockSize.x - 1) / blockSize.x, ((uint32_t)dst_height + blockSize.y - 1) / blockSize.y, 1); + + if (b_resize_uv) { + Scale_UV <<>>(dp_src, src_pitch, dp_dst, + dst_pitch, dst_width, dst_height, 1.0f * src_width / dst_width, 1.0f * src_height / dst_height); + } + else { + Scale <<>>(dp_src, src_pitch, dp_dst, + dst_pitch, dst_width, dst_height, 1.0f * src_width / dst_width, 1.0f * src_height / dst_height); + } +#endif + +} + +void ResizeYUV420(uint8_t *p_dst_y, + uint8_t* p_dst_u, + uint8_t* p_dst_v, + int dst_pitch_y, + int dst_pitch_uv, + int dst_width, + int dst_height, + uint8_t *p_src_y, + uint8_t* p_src_u, + uint8_t* p_src_v, + int src_pitch_y, + int src_pitch_uv, + int src_width, + int src_height, + bool b_nv12, + hipStream_t hip_stream) { + + int uv_width_dst = (dst_width + 1) >> 1; + int uv_height_dst = (dst_width + 1) >> 1; + int uv_width_src = (src_width + 1) >> 1; + int uv_height_src = (src_height + 1) >> 1; + + // Scale Y plane + ResizeYUVHipLaunchKernel(p_dst_y, dst_pitch_y, dst_width, dst_height, p_src_y, src_pitch_y, src_width, src_height, 0, hip_stream); + if (b_nv12) { + ResizeYUVHipLaunchKernel(p_dst_u, dst_pitch_uv, uv_width_dst, uv_height_dst, p_src_u, src_pitch_uv, uv_width_src, uv_height_src, b_nv12, hip_stream); + } else { + ResizeYUVHipLaunchKernel(p_dst_u, dst_pitch_uv, uv_width_dst, uv_height_dst, p_src_u, src_pitch_uv, uv_width_src, uv_height_src, b_nv12, hip_stream); + ResizeYUVHipLaunchKernel(p_dst_v, dst_pitch_uv, uv_width_dst, uv_height_dst, p_src_v, src_pitch_uv, uv_width_src, uv_height_src, b_nv12, hip_stream); + } +} + diff --git a/utils/resize_kernels.h b/utils/resize_kernels.h new file mode 100644 index 0000000000..d788b63aff --- /dev/null +++ b/utils/resize_kernels.h @@ -0,0 +1,103 @@ +/* +Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once +#include +#include + + +/** + * @brief Function to resize both planes of an NV12 image + * + * + * @param p_dst_nv12 - destination pointer Y plane + * @param dst_pitch - destination pitch + * @param dst_width - destination width + * @param dst_height - destination height + * @param p_src_nv12 - source pointer + * @param src_pitch - source pitch + * @param src_width - source width + * @param src_height - source height + * @param p_src_nv12_uv - source pointer of UV plane + * @param hip_stream - Stream for launching the kernel + */ +void ResizeNv12(uint8_t *p_dst_nv12, int dst_pitch, int dst_width, int dst_height, uint8_t *p_src_nv12, + int src_pitch, int src_width, int src_height, unsigned char* p_src_nv12_uv, unsigned char* p_dst_nv12_uv, hipStream_t hip_stream); + +/** + * @brief + * + * @param p_dst_p016 + * @param dst_pitch + * @param dst_width + * @param dst_height + * @param p_src_p016 + * @param src_pitch + * @param src_width + * @param src_height + * @param p_src_p016_uv + * @param p_dst_p016_uv + * @param hip_stream - Stream for launching the kernel + */ +void ResizeP016(uint8_t *p_dst_p016, int dst_pitch, int dst_width, int dst_height, uint8_t *p_src_p016, int src_pitch, + int src_width, int src_height, unsigned char* p_src_p016_uv, unsigned char* p_dst_p016_uv, hipStream_t hip_stream); + +/** + * @brief Function to resize 420 YUV image + * + * @param p_dst_y - Destination Y plane pointer + * @param p_dst_u - Destination U plane pointer + * @param p_dst_v - Destination V plane pointer + * @param dst_pitch_y - Destination Pitch Y + * @param dst_pitch_uv - Destination Pitch UV + * @param dst_width - Destination Width + * @param dst_height - Destination Height + * @param p_src_y - Src Y plane pointer + * @param p_src_u - Src U plane pointer + * @param p_src_v - Src V plane pointer + * @param src_pitch_y - Src Pitch Y + * @param src_pitch_uv - Src Pitch UV + * @param src_width - Src Width + * @param src_height - Src Height + * @param b_nv12 - Is uv interleaved? + * @param hip_stream - Stream for launching the kernel + */ +void ResizeYUV420(uint8_t *p_dst_y, uint8_t* p_dst_u, uint8_t* p_dst_v, int dst_pitch_y, int dst_pitch_uv, + int dst_width, int dst_height, uint8_t *p_src_y, uint8_t* p_src_u, uint8_t* p_src_v, + int src_pitch_y, int src_pitch_uv, int src_width, int src_height, bool b_nv12 = false, hipStream_t hip_stream = nullptr); + +/** + * @brief The function to launch ResizeYUV HIP kernel + * + * @param dp_dst - dest pointer + * @param dst_pitch - Pitch of the dst plane + * @param dst_width - Width of the dst plane + * @param dst_height - Height of the dst plane + * @param dp_src - source pointer + * @param src_pitch - source pitch + * @param src_width - source width + * @param src_height - source height + * @param b_resize_uv - to resize UV plance or not + * @param hip_stream - Stream for launching the kernel + */ +void ResizeYUVHipLaunchKernel(uint8_t *dp_dst, int dst_pitch, int dst_width, int dst_height, uint8_t *dp_src, int src_pitch, + int src_width, int src_height, bool b_resize_uv = false, hipStream_t hip_stream = nullptr); diff --git a/utils/rocvideodecode/roc_video_dec.h b/utils/rocvideodecode/roc_video_dec.h index e9bb5324d1..408e5f8493 100644 --- a/utils/rocvideodecode/roc_video_dec.h +++ b/utils/rocvideodecode/roc_video_dec.h @@ -124,6 +124,10 @@ struct Rect { int bottom; }; +struct Dim { + int w, h; +}; + static inline int align(int value, int alignment) { return (value + alignment - 1) & ~(alignment - 1); }