diff --git a/runtime/hsa-runtime/CMakeLists.txt b/runtime/hsa-runtime/CMakeLists.txt index a208b107e6..d3309655bb 100644 --- a/runtime/hsa-runtime/CMakeLists.txt +++ b/runtime/hsa-runtime/CMakeLists.txt @@ -248,6 +248,7 @@ if(${IMAGE_SUPPORT}) image/image_manager_ai.cpp image/image_manager_nv.cpp image/image_manager_gfx11.cpp + image/image_manager_gfx12.cpp image/image_lut_kv.cpp image/image_lut_gfx11.cpp image/blit_object_gfx7xx.cpp diff --git a/runtime/hsa-runtime/image/image_manager_gfx12.cpp b/runtime/hsa-runtime/image/image_manager_gfx12.cpp new file mode 100644 index 0000000000..14a37b8fec --- /dev/null +++ b/runtime/hsa-runtime/image/image_manager_gfx12.cpp @@ -0,0 +1,896 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#define NOMINMAX +#include "image_manager_gfx12.h" + +#include + +#include +#include + +#include "inc/hsa_ext_amd.h" +#include "core/inc/hsa_internal.h" +#include "core/util/utils.h" +#include "addrlib/src/core/addrlib.h" +#include "image_runtime.h" +#include "resource.h" +#include "resource_gfx12.h" +#include "util.h" +#include "device_info.h" + +namespace rocr { +namespace image { + +static_assert(sizeof(SQ_BUF_RSRC_WORD0) == sizeof(uint32_t)); +static_assert(sizeof(SQ_BUF_RSRC_WORD1) == sizeof(uint32_t)); +static_assert(sizeof(SQ_BUF_RSRC_WORD2) == sizeof(uint32_t)); +static_assert(sizeof(SQ_BUF_RSRC_WORD3) == sizeof(uint32_t)); + +static_assert(sizeof(SQ_IMG_RSRC_WORD0) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD1) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD2) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD3) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD4) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD5) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD6) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD7) == sizeof(uint32_t)); + +static_assert(sizeof(SQ_IMG_SAMP_WORD0) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_SAMP_WORD1) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_SAMP_WORD2) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_SAMP_WORD3) == sizeof(uint32_t)); + +//----------------------------------------------------------------------------- +// Workaround switch to combined format/type codes and missing gfx11 +// specific look up table. Only covers types used in image_lut_gfx11.cpp. +//----------------------------------------------------------------------------- +struct formatconverstion_t { + FMT fmt; + type type; + FORMAT format; +}; + +// Format/Type to combined format code table. +// Sorted and indexed to allow fast searches. +static const formatconverstion_t FormatLUT[] = { + {FMT_1_5_5_5, TYPE_UNORM, CFMT_1_5_5_5_UNORM}, // 0 + {FMT_10_10_10_2, TYPE_UNORM, CFMT_10_10_10_2_UNORM}, // 1 + {FMT_10_10_10_2, TYPE_SNORM, CFMT_10_10_10_2_SNORM}, // 2 + {FMT_10_10_10_2, TYPE_UINT, CFMT_10_10_10_2_UINT}, // 3 + {FMT_10_10_10_2, TYPE_SINT, CFMT_10_10_10_2_SINT}, // 4 + {FMT_16, TYPE_UNORM, CFMT_16_UNORM}, // 5 + {FMT_16, TYPE_SNORM, CFMT_16_SNORM}, // 6 + {FMT_16, TYPE_UINT, CFMT_16_UINT}, // 7 + {FMT_16, TYPE_SINT, CFMT_16_SINT}, // 8 + {FMT_16, TYPE_FLOAT, CFMT_16_FLOAT}, // 9 + {FMT_16, TYPE_USCALED, CFMT_16_USCALED}, // 10 + {FMT_16, TYPE_SSCALED, CFMT_16_SSCALED}, // 11 + {FMT_16_16, TYPE_UNORM, CFMT_16_16_UNORM}, // 12 + {FMT_16_16, TYPE_SNORM, CFMT_16_16_SNORM}, // 13 + {FMT_16_16, TYPE_UINT, CFMT_16_16_UINT}, // 14 + {FMT_16_16, TYPE_SINT, CFMT_16_16_SINT}, // 15 + {FMT_16_16, TYPE_FLOAT, CFMT_16_16_FLOAT}, // 16 + {FMT_16_16, TYPE_USCALED, CFMT_16_16_USCALED}, // 17 + {FMT_16_16, TYPE_SSCALED, CFMT_16_16_SSCALED}, // 18 + {FMT_16_16_16_16, TYPE_UNORM, CFMT_16_16_16_16_UNORM}, // 19 + {FMT_16_16_16_16, TYPE_SNORM, CFMT_16_16_16_16_SNORM}, // 20 + {FMT_16_16_16_16, TYPE_UINT, CFMT_16_16_16_16_UINT}, // 21 + {FMT_16_16_16_16, TYPE_SINT, CFMT_16_16_16_16_SINT}, // 22 + {FMT_16_16_16_16, TYPE_FLOAT, CFMT_16_16_16_16_FLOAT}, // 23 + {FMT_16_16_16_16, TYPE_USCALED, CFMT_16_16_16_16_USCALED}, // 24 + {FMT_16_16_16_16, TYPE_SSCALED, CFMT_16_16_16_16_SSCALED}, // 25 + {FMT_2_10_10_10, TYPE_UNORM, CFMT_2_10_10_10_UNORM}, // 26 + {FMT_2_10_10_10, TYPE_SNORM, CFMT_2_10_10_10_SNORM}, // 27 + {FMT_2_10_10_10, TYPE_UINT, CFMT_2_10_10_10_UINT}, // 28 + {FMT_2_10_10_10, TYPE_SINT, CFMT_2_10_10_10_SINT}, // 29 + {FMT_2_10_10_10, TYPE_USCALED, CFMT_2_10_10_10_USCALED}, // 30 + {FMT_2_10_10_10, TYPE_SSCALED, CFMT_2_10_10_10_SSCALED}, // 31 + {FMT_24_8, TYPE_UNORM, CFMT_24_8_UNORM}, // 32 + {FMT_24_8, TYPE_UINT, CFMT_24_8_UINT}, // 33 + {FMT_32, TYPE_UINT, CFMT_32_UINT}, // 34 + {FMT_32, TYPE_SINT, CFMT_32_SINT}, // 35 + {FMT_32, TYPE_FLOAT, CFMT_32_FLOAT}, // 36 + {FMT_32_32, TYPE_UINT, CFMT_32_32_UINT}, // 37 + {FMT_32_32, TYPE_SINT, CFMT_32_32_SINT}, // 38 + {FMT_32_32, TYPE_FLOAT, CFMT_32_32_FLOAT}, // 39 + {FMT_32_32_32, TYPE_UINT, CFMT_32_32_32_UINT}, // 40 + {FMT_32_32_32, TYPE_SINT, CFMT_32_32_32_SINT}, // 41 + {FMT_32_32_32, TYPE_FLOAT, CFMT_32_32_32_FLOAT}, // 42 + {FMT_32_32_32_32, TYPE_UINT, CFMT_32_32_32_32_UINT}, // 43 + {FMT_32_32_32_32, TYPE_SINT, CFMT_32_32_32_32_SINT}, // 44 + {FMT_32_32_32_32, TYPE_FLOAT, CFMT_32_32_32_32_FLOAT}, // 45 + {FMT_5_5_5_1, TYPE_UNORM, CFMT_5_5_5_1_UNORM}, // 46 + {FMT_5_6_5, TYPE_UNORM, CFMT_5_6_5_UNORM}, // 47 + {FMT_8, TYPE_UNORM, CFMT_8_UNORM}, // 48 + {FMT_8, TYPE_SNORM, CFMT_8_SNORM}, // 49 + {FMT_8, TYPE_UINT, CFMT_8_UINT}, // 50 + {FMT_8, TYPE_SINT, CFMT_8_SINT}, // 51 + {FMT_8, TYPE_SRGB, CFMT_8_SRGB}, // 52 + {FMT_8, TYPE_USCALED, CFMT_8_USCALED}, // 53 + {FMT_8, TYPE_SSCALED, CFMT_8_SSCALED}, // 54 + {FMT_8_24, TYPE_UNORM, CFMT_8_24_UNORM}, // 55 + {FMT_8_24, TYPE_UINT, CFMT_8_24_UINT}, // 56 + {FMT_8_8, TYPE_UNORM, CFMT_8_8_UNORM}, // 57 + {FMT_8_8, TYPE_SNORM, CFMT_8_8_SNORM}, // 58 + {FMT_8_8, TYPE_UINT, CFMT_8_8_UINT}, // 59 + {FMT_8_8, TYPE_SINT, CFMT_8_8_SINT}, // 60 + {FMT_8_8, TYPE_SRGB, CFMT_8_8_SRGB}, // 61 + {FMT_8_8, TYPE_USCALED, CFMT_8_8_USCALED}, // 62 + {FMT_8_8, TYPE_SSCALED, CFMT_8_8_SSCALED}, // 63 + {FMT_8_8_8_8, TYPE_UNORM, CFMT_8_8_8_8_UNORM}, // 64 + {FMT_8_8_8_8, TYPE_SNORM, CFMT_8_8_8_8_SNORM}, // 65 + {FMT_8_8_8_8, TYPE_UINT, CFMT_8_8_8_8_UINT}, // 66 + {FMT_8_8_8_8, TYPE_SINT, CFMT_8_8_8_8_SINT}, // 67 + {FMT_8_8_8_8, TYPE_SRGB, CFMT_8_8_8_8_SRGB}, // 68 + {FMT_8_8_8_8, TYPE_USCALED, CFMT_8_8_8_8_USCALED}, // 69 + {FMT_8_8_8_8, TYPE_SSCALED, CFMT_8_8_8_8_SSCALED} // 70 +}; +static const int FormatLUTSize = sizeof(FormatLUT)/sizeof(formatconverstion_t); + +//Index in FormatLUT to start search, indexed by FMT enum. +static const int FormatEntryPoint[] = { + 71, // FMT_INVALID + 48, // FMT_8 + 5, // FMT_16 + 57, // FMT_8_8 + 34, // FMT_32 + 12, // FMT_16_16 + 71, // FMT_10_11_11 + 71, // FMT_11_11_10 + 1, // FMT_10_10_10_2 + 26, // FMT_2_10_10_10 + 64, // FMT_8_8_8_8 + 37, // FMT_32_32 + 19, // FMT_16_16_16_16 + 40, // FMT_32_32_32 + 43, // FMT_32_32_32_32 + 71, // RESERVED + 47, // FMT_5_6_5 + 0, // FMT_1_5_5_5 + 46, // FMT_5_5_5_1 + 71, // FMT_4_4_4_4 + 55, // FMT_8_24 + 32 // FMT_24_8 +}; + +static FORMAT GetCombinedFormat(uint8_t fmt, uint8_t type) { + assert(fmt < sizeof(FormatEntryPoint)/sizeof(int) && "FMT out of range."); + int start = FormatEntryPoint[fmt]; + int stop = std::min(start + 6, FormatLUTSize); // Only 6 types are used in image_kv_lut.cpp + + for(int i=start; i> 3) * out.pitch; + size_t slicePitch = rowPitch * out.height; + if (desc.geometry != HSA_EXT_IMAGE_GEOMETRY_1DB && + image_data_layout == HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR && + ((image_data_row_pitch && (rowPitch != image_data_row_pitch)) || + (image_data_slice_pitch && (slicePitch != image_data_slice_pitch)))) { + return static_cast( + HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED); + } + + image_info.size = out.surfSize; + assert(image_info.size != 0); + image_info.alignment = out.baseAlign; + assert(image_info.alignment != 0); + + return HSA_STATUS_SUCCESS; +} + +bool ImageManagerGfx12::IsLocalMemory(const void* address) const { + return true; +} + +hsa_status_t ImageManagerGfx12::PopulateImageSrd(Image& image, + const metadata_amd_t* descriptor) const { + const metadata_amd_gfx12_t* desc = reinterpret_cast(descriptor); + const void* image_data_addr = image.data; + + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + if ((image_prop.cap == HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED) || + (image_prop.element_size == 0)) + return (hsa_status_t)HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED; + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + + if (IsLocalMemory(image.data)) { + image_data_addr = reinterpret_cast( + reinterpret_cast(image.data) - local_memory_base_address_); + } + + image.srd[0] = desc->word0.u32All; + image.srd[1] = desc->word1.u32All; + image.srd[2] = desc->word2.u32All; + image.srd[3] = desc->word3.u32All; + image.srd[4] = desc->word4.u32All; + image.srd[5] = desc->word5.u32All; + image.srd[6] = desc->word6.u32All; + image.srd[7] = desc->word7.u32All; + + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + SQ_BUF_RSRC_WORD0 word0; + SQ_BUF_RSRC_WORD1 word1; + SQ_BUF_RSRC_WORD3 word3; + + word0.val = 0; + word0.f.BASE_ADDRESS = PtrLow32(image_data_addr); + + word1.val = image.srd[1]; + word1.f.BASE_ADDRESS_HI = PtrHigh32(image_data_addr); + word1.f.STRIDE = image_prop.element_size; + + word3.val = image.srd[3]; + word3.f.DST_SEL_X = swizzle.x; + word3.f.DST_SEL_Y = swizzle.y; + word3.f.DST_SEL_Z = swizzle.z; + word3.f.DST_SEL_W = swizzle.w; + + word3.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + + word3.f.INDEX_STRIDE = image_prop.element_size; + + // New to GFX12 + //word3.f.WRITE_COMPRESS_ENABLE = 0; + //word3.f.COMPRESSION_EN = 0; + //word3.f.COMPRESSION_ACCESS_MODE = 0; + + image.srd[0] = word0.val; + image.srd[1] = word1.val; + image.srd[3] = word3.val; + } else { + uint32_t hwPixelSize = ImageLut().GetPixelSize(image_prop.data_format, image_prop.data_type); + + if (image_prop.element_size != hwPixelSize) { + return (hsa_status_t)HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED; + } + reinterpret_cast(&image.srd[0])->bits.BASE_ADDRESS = + PtrLow40Shift8(image_data_addr); + reinterpret_cast(&image.srd[1])->bits.BASE_ADDRESS_HI = + PtrHigh64Shift40(image_data_addr); + + // New to GFX12... + //reinterpret_cast(&image.srd[1])->bits.MAX_MIP = 0; + + reinterpret_cast(&image.srd[1])->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + reinterpret_cast(&image.srd[3])->bits.DST_SEL_X = + swizzle.x; + reinterpret_cast(&image.srd[3])->bits.DST_SEL_Y = + swizzle.y; + reinterpret_cast(&image.srd[3])->bits.DST_SEL_Z = + swizzle.z; + reinterpret_cast(&image.srd[3])->bits.DST_SEL_W = + swizzle.w; + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA || + image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1D) { + reinterpret_cast(&image.srd[3])->bits.TYPE = + ImageLut().MapGeometry(image.desc.geometry); + } + } + + // Looks like this is only used for CPU copies. + image.row_pitch = 0; + image.slice_pitch = 0; + + // Used by HSAIL shader ABI + image.srd[8] = image.desc.format.channel_type; + image.srd[9] = image.desc.format.channel_order; + image.srd[10] = static_cast(image.desc.width); + + return HSA_STATUS_SUCCESS; +} + +static TEX_BC_SWIZZLE GetBcSwizzle(const Swizzle& swizzle) { + SEL r = (SEL)swizzle.x; + SEL g = (SEL)swizzle.y; + SEL b = (SEL)swizzle.z; + SEL a = (SEL)swizzle.w; + + TEX_BC_SWIZZLE bcSwizzle = TEX_BC_Swizzle_XYZW; + + if (a == SEL_X) { + // Have to use either TEX_BC_Swizzle_WZYX or TEX_BC_Swizzle_WXYZ + // + // For the pre-defined border color values (white, opaque black, + // transparent black), the only thing that matters is that the alpha + // channel winds up in the correct place (because the RGB channels are + // all the same) so either of these TEX_BC_Swizzle enumerations will + // work. Not sure what happens with border color palettes. + if (b == SEL_Y) { + // ABGR + bcSwizzle = TEX_BC_Swizzle_WZYX; + } else if ((r == SEL_X) && (g == SEL_X) && (b == SEL_X)) { + // RGBA + bcSwizzle = TEX_BC_Swizzle_XYZW; + } else { + // ARGB + bcSwizzle = TEX_BC_Swizzle_WXYZ; + } + } else if (r == SEL_X) { + // Have to use either TEX_BC_Swizzle_XYZW or TEX_BC_Swizzle_XWYZ + if (g == SEL_Y) { + // RGBA + bcSwizzle = TEX_BC_Swizzle_XYZW; + } else if ((g == SEL_X) && (b == SEL_X) && (a == SEL_W)) { + // RGBA + bcSwizzle = TEX_BC_Swizzle_XYZW; + } else { + // RAGB + bcSwizzle = TEX_BC_Swizzle_XWYZ; + } + } else if (g == SEL_X) { + // GRAB, have to use TEX_BC_Swizzle_YXWZ + bcSwizzle = TEX_BC_Swizzle_YXWZ; + } else if (b == SEL_X) { + // BGRA, have to use TEX_BC_Swizzle_ZYXW + bcSwizzle = TEX_BC_Swizzle_ZYXW; + } + + return bcSwizzle; +} + + +hsa_status_t ImageManagerGfx12::PopulateImageSrd(Image& image) const { + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + assert(image_prop.cap != HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED); + assert(image_prop.element_size != 0); + + const void* image_data_addr = image.data; + + if (IsLocalMemory(image.data)) + image_data_addr = reinterpret_cast( + reinterpret_cast(image.data) - local_memory_base_address_); + + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + SQ_BUF_RSRC_WORD0 word0; + SQ_BUF_RSRC_WORD1 word1; + SQ_BUF_RSRC_WORD2 word2; + SQ_BUF_RSRC_WORD3 word3; + + word0.val = 0; + word0.f.BASE_ADDRESS = PtrLow32(image_data_addr); + + word1.val = 0; + word1.f.BASE_ADDRESS_HI = PtrHigh32(image_data_addr); + word1.f.STRIDE = image_prop.element_size; + + word1.f.SWIZZLE_ENABLE = 0; + + word2.f.NUM_RECORDS = image.desc.width * image_prop.element_size; + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + word3.val = 0; + word3.f.DST_SEL_X = swizzle.x; + word3.f.DST_SEL_Y = swizzle.y; + word3.f.DST_SEL_Z = swizzle.z; + word3.f.DST_SEL_W = swizzle.w; + word3.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + + word3.f.INDEX_STRIDE = image_prop.element_size; + + // New to GFX12 + //word3.f.WRITE_COMPRESS_ENABLE = 0; + //word3.f.COMPRESSION_EN = 0; + //word3.f.COMPRESSION_ACCESS_MODE = 0; + + word3.f.TYPE = ImageLut().MapGeometry(image.desc.geometry); + + image.srd[0] = word0.val; + image.srd[1] = word1.val; + image.srd[2] = word2.val; + image.srd[3] = word3.val; + + image.row_pitch = image.desc.width * image_prop.element_size; + image.slice_pitch = image.row_pitch; + } else { + SQ_IMG_RSRC_WORD0 word0; + SQ_IMG_RSRC_WORD1 word1; + SQ_IMG_RSRC_WORD2 word2; + SQ_IMG_RSRC_WORD3 word3; + SQ_IMG_RSRC_WORD4 word4; + SQ_IMG_RSRC_WORD5 word5; + SQ_IMG_RSRC_WORD5 word6; + SQ_IMG_RSRC_WORD5 word7; + + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT out = {0}; + + uint32_t swizzleMode = GetAddrlibSurfaceInfoNv( + image.component, image.desc, image.tile_mode, + image.row_pitch, image.slice_pitch, out); + if (swizzleMode == (uint32_t)(-1)) { + return HSA_STATUS_ERROR; + } + + assert((out.bpp / 8) == image_prop.element_size); + + const size_t row_pitch_size = out.pitch * image_prop.element_size; + + word0.f.BASE_ADDRESS = PtrLow40Shift8(image_data_addr); + + word1.val = 0; + word1.f.BASE_ADDRESS_HI = PtrHigh64Shift40(image_data_addr); + + // New to GFX12 + //word1.f.MAX_MIP = 0; + //word1.f.BASE_LEVEL = 0; + + word1.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + // Only take the lowest 2 bits of (image.desc.width - 1) + word1.f.WIDTH = BitSelect<0, 1>(image.desc.width - 1); + + word2.val = 0; + // Take the high 14 bits of (image.desc.width - 1) + word2.f.WIDTH_HI = BitSelect<2, 15>(image.desc.width - 1); + word2.f.HEIGHT = image.desc.height ? image.desc.height - 1 : 0; + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + word3.val = 0; + word3.f.DST_SEL_X = swizzle.x; + word3.f.DST_SEL_Y = swizzle.y; + word3.f.DST_SEL_Z = swizzle.z; + word3.f.DST_SEL_W = swizzle.w; + //word3.f.NO_EDGE_CLAMP = 0; // New to GFX12 + //word3.f.LAST_LEVEL = 0; // New to GFX12 + word3.f.SW_MODE = swizzleMode; + word3.f.BC_SWIZZLE = GetBcSwizzle(swizzle); + word3.f.TYPE = ImageLut().MapGeometry(image.desc.geometry); + + const bool image_array = + (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA || + image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_2DA || + image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_2DADEPTH); + const bool image_3d = (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_3D); + + word4.val = 0; + word4.f.DEPTH = + (image_array) // Doesn't hurt but isn't array_size already >0? + ? std::max(image.desc.array_size, static_cast(1)) - 1 + : (image_3d) ? image.desc.depth - 1 : 0; + + // For 1d, 2d and 2d-msaa this is pitch-1 + if (!image_array && !image_3d) { + uint32_t encPitch = out.pitch - 1; + word4.f.DEPTH = encPitch & 0x1fff; // 13 bits + word4.f.PITCH_MSB = (encPitch >> 13) & 0x3; // last 2 bits + } + + word5.val = 0; + word6.val = 0; + word7.val = 0; + + image.srd[0] = word0.val; + image.srd[1] = word1.val; + image.srd[2] = word2.val; + image.srd[3] = word3.val; + image.srd[4] = word4.val; + image.srd[5] = word5.val; + image.srd[6] = word6.val; + image.srd[7] = word7.val; + + image.row_pitch = row_pitch_size; + image.slice_pitch = out.sliceSize; + } + + image.srd[8] = image.desc.format.channel_type; + image.srd[9] = image.desc.format.channel_order; + image.srd[10] = static_cast(image.desc.width); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ImageManagerGfx12::ModifyImageSrd( + Image& image, hsa_ext_image_format_t& new_format) const { + image.desc.format = new_format; + + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + assert(image_prop.cap != HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED); + assert(image_prop.element_size != 0); + + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + SQ_BUF_RSRC_WORD3* word3 = + reinterpret_cast(&image.srd[3]); + word3->bits.DST_SEL_X = swizzle.x; + word3->bits.DST_SEL_Y = swizzle.y; + word3->bits.DST_SEL_Z = swizzle.z; + word3->bits.DST_SEL_W = swizzle.w; + word3->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + } else { + SQ_IMG_RSRC_WORD1* word1 = + reinterpret_cast(&image.srd[1]); + word1->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + SQ_IMG_RSRC_WORD3* word3 = + reinterpret_cast(&image.srd[3]); + word3->bits.DST_SEL_X = swizzle.x; + word3->bits.DST_SEL_Y = swizzle.y; + word3->bits.DST_SEL_Z = swizzle.z; + word3->bits.DST_SEL_W = swizzle.w; + } + + image.srd[8] = image.desc.format.channel_type; + image.srd[9] = image.desc.format.channel_order; + image.srd[10] = static_cast(image.desc.width); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ImageManagerGfx12::PopulateSamplerSrd(Sampler& sampler) const { + const hsa_ext_sampler_descriptor_t sampler_descriptor = sampler.desc; + + SQ_IMG_SAMP_WORD0 word0; + SQ_IMG_SAMP_WORD1 word1; + SQ_IMG_SAMP_WORD2 word2; + SQ_IMG_SAMP_WORD3 word3; + + word0.u32All = 0; + switch (sampler_descriptor.address_mode) { + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: + word0.bits.CLAMP_X = static_cast(SQ_TEX_CLAMP_LAST_TEXEL); + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: + word0.bits.CLAMP_X = static_cast(SQ_TEX_CLAMP_BORDER); + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: + word0.bits.CLAMP_X = static_cast(SQ_TEX_MIRROR); + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: + case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: + word0.bits.CLAMP_X = static_cast(SQ_TEX_WRAP); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + word0.bits.CLAMP_Y = word0.bits.CLAMP_X; + word0.bits.CLAMP_Z = word0.bits.CLAMP_X; + word0.bits.FORCE_UNNORMALIZED = (sampler_descriptor.coordinate_mode == + HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED); + + word1.u32All = 0; + word1.bits.MAX_LOD = 4095; + + word2.u32All = 0; + switch (sampler_descriptor.filter_mode) { + case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: + word2.bits.XY_MAG_FILTER = static_cast(SQ_TEX_XY_FILTER_POINT); + break; + case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: + word2.bits.XY_MAG_FILTER = static_cast(SQ_TEX_XY_FILTER_BILINEAR); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + word2.bits.XY_MIN_FILTER = word2.bits.XY_MAG_FILTER; + word2.bits.Z_FILTER = SQ_TEX_Z_FILTER_NONE; + word2.bits.MIP_FILTER = SQ_TEX_MIP_FILTER_NONE; + + word3.u32All = 0; + + // TODO: check this bit with HSAIL spec. + word3.bits.BORDER_COLOR_TYPE = SQ_TEX_BORDER_COLOR_TRANS_BLACK; + + sampler.srd[0] = word0.u32All; + sampler.srd[1] = word1.u32All; + sampler.srd[2] = word2.u32All; + sampler.srd[3] = word3.u32All; + + return HSA_STATUS_SUCCESS; +} + +uint32_t ImageManagerGfx12::GetAddrlibSurfaceInfoNv( + hsa_agent_t component, const hsa_ext_image_descriptor_t& desc, + Image::TileMode tileMode, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT& out) const { + const ImageProperty image_prop = + GetImageProperty(component, desc.format, desc.geometry); + + const AddrFormat addrlib_format = GetAddrlibFormat(image_prop); + + const uint32_t width = static_cast(desc.width); + const uint32_t height = static_cast(desc.height); + static const size_t kMinNumSlice = 1; + const uint32_t num_slice = static_cast( + std::max(kMinNumSlice, std::max(desc.array_size, desc.depth))); + + ADDR3_COMPUTE_SURFACE_INFO_INPUT in = {0}; + in.size = sizeof(ADDR3_COMPUTE_SURFACE_INFO_INPUT); + in.format = addrlib_format; + in.bpp = static_cast(image_prop.element_size) * 8; + in.width = width; + in.height = height; + in.numSlices = num_slice; + in.pitchInElement = image_data_row_pitch / image_prop.element_size; + + switch (desc.geometry) { + case HSA_EXT_IMAGE_GEOMETRY_1D: + case HSA_EXT_IMAGE_GEOMETRY_1DB: + case HSA_EXT_IMAGE_GEOMETRY_1DA: + in.resourceType = ADDR_RSRC_TEX_1D; + break; + + case HSA_EXT_IMAGE_GEOMETRY_2D: + case HSA_EXT_IMAGE_GEOMETRY_2DDEPTH: + case HSA_EXT_IMAGE_GEOMETRY_2DA: + case HSA_EXT_IMAGE_GEOMETRY_2DADEPTH: + in.resourceType = ADDR_RSRC_TEX_2D; + break; + + case HSA_EXT_IMAGE_GEOMETRY_3D: + in.resourceType = ADDR_RSRC_TEX_3D; + break; + } + in.flags.texture = 1; + + if (tileMode == Image::TileMode::LINEAR) + { + in.swizzleMode = ADDR3_LINEAR; + } else { + + /* + * AddrLib3 does not provide the best swizzle mode (unlike AddrLib2). + * Instead, client has to request the list of possible swizzle mode and + * then pick the best one for its needs (i.e. performance/space tradeoffs). + * + */ + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT swOut = { 0 }; + swOut.size = sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT); + + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT swIn = { 0 }; + swIn.size = sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT); + swIn.flags = in.flags; + swIn.resourceType = in.resourceType; + swIn.bpp = in.bpp; + swIn.width = in.width; + swIn.height = in.height; + swIn.numSlices = in.numSlices; + swIn.numMipLevels = in.numMipLevels; + swIn.numSamples = in.numSamples; + /* + * Cannot leave it to 0 like GFX11 Addr2GetPreferredSurfaceSetting method + * as it triggers an ASSERT in AddrLib3 code. + * + * Setting it to 256K to allow for maximum number of swizzle mode in set + * returned (similar behaviour as GFX11). + * + */ + swIn.maxAlign = 256 * 1024; + + + if (ADDR_OK != Addr3GetPossibleSwizzleModes(addr_lib_, &swIn, &swOut)) { + debug_print("Addr3GetPossibleSwizzleModes failed!\n"); + return (uint32_t) -1; + } + + /* + * Remove any modes that the client does not want (if any). + */ + //swOut.validModes.sw***** = 0; + + + /* + * Pick the "best" swizzle mode. + * + * This algorithm is based on behaviour in GFX11 AddrLib and on + * GFX12 code in PAL (that is also based on the GFX11 behaviour). + * + * Ratio variables control the extra space that can be used to get a larger + * swizzle mode. + * + * ratioLow:ratioHi meanings: + * + * 2:1 ratio - same behaviour as GFX11. + * 3:2 ratio - would be equivalent if flag opt4space in GFX11 (not used in ROCr) + * 1:1 ratio - minimum size, not necessary best for performance + * + */ + const UINT_32 ratioLow = 2; + const UINT_32 ratioHigh = 1; + + // Same behaviour as GFX11, remove linear if height is 1. + if (in.height > 1) { + swOut.validModes.swLinear = 0; + } + + UINT_64 minSize = 0; + Addr3SwizzleMode bestSwizzle = ADDR3_MAX_TYPE; + + for (uint32_t i = ADDR3_LINEAR; i < ADDR3_MAX_TYPE; i++) { + + if (swOut.validModes.value & (1 << i)) { + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT localOut = {0}; + localOut.size = sizeof(ADDR3_COMPUTE_SURFACE_INFO_OUTPUT); + + in.swizzleMode = (Addr3SwizzleMode) i; + + if (ADDR_OK != Addr3ComputeSurfaceInfo(addr_lib_, &in, &localOut)) { + // Should not happen, if it does, ignore this swizzle mode. + debug_print("Addr3ComputeSurfaceInfo failed!\n"); + continue; + } + + UINT_64 surfaceSize = localOut.surfSize; + + if (bestSwizzle == ADDR3_MAX_TYPE) { + minSize = surfaceSize; + bestSwizzle = (Addr3SwizzleMode) i; + } else if ((surfaceSize * ratioHigh) <= (minSize * ratioLow)) { + bestSwizzle = (Addr3SwizzleMode) i; + } + } + } + + if (bestSwizzle < ADDR3_MAX_TYPE) { + in.swizzleMode = (Addr3SwizzleMode) bestSwizzle; + } else { + debug_print("Unable to find a valid swizzleMode for the surface!\n"); + return (uint32_t) -1; + } + } + + + out.size = sizeof(ADDR3_COMPUTE_SURFACE_INFO_OUTPUT); + + if (ADDR_OK != Addr3ComputeSurfaceInfo(addr_lib_, &in, &out)) { + return (uint32_t)(-1); + } + if (out.surfSize == 0) { + return (uint32_t)(-1); + } + + return in.swizzleMode; +} + +hsa_status_t ImageManagerGfx12::FillImage(const Image& image, const void* pattern, + const hsa_ext_image_region_t& region) { + if (BlitQueueInit().queue_ == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + Image* image_view = const_cast(&image); + + SQ_BUF_RSRC_WORD3* word3_buff = NULL; + SQ_IMG_RSRC_WORD3* word3_image = NULL; + uint32_t dst_sel_w_original = 0; + if (image_view->desc.format.channel_type == + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010) { + // Force GPU to ignore the last two bits (alpha bits). + if (image_view->desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + word3_buff = reinterpret_cast(&image_view->srd[3]); + dst_sel_w_original = word3_buff->bits.DST_SEL_W; + word3_buff->bits.DST_SEL_W = SEL_0; + } else { + word3_image = reinterpret_cast(&image_view->srd[3]); + dst_sel_w_original = word3_image->bits.DST_SEL_W; + word3_image->bits.DST_SEL_W = SEL_0; + } + } + + SQ_IMG_RSRC_WORD1* word1 = NULL; + uint32_t num_format_original = 0; + const void* new_pattern = pattern; + float fill_value[4] = {0}; + switch (image_view->desc.format.channel_order) { + case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA: + case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB: + case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX: + case HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA: { + // We do not have write support for SRGBA image, so convert pattern + // to standard form and treat the image as RGBA image. + const float* pattern_f = reinterpret_cast(pattern); + fill_value[0] = LinearToStandardRGB(pattern_f[0]); + fill_value[1] = LinearToStandardRGB(pattern_f[1]); + fill_value[2] = LinearToStandardRGB(pattern_f[2]); + fill_value[3] = pattern_f[3]; + new_pattern = fill_value; + + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + + word1 = reinterpret_cast(&image_view->srd[1]); + num_format_original = word1->bits.FORMAT; + word1->bits.FORMAT = GetCombinedFormat(image_prop.data_format, TYPE_UNORM); + } break; + default: + break; + } + + hsa_status_t status = ImageRuntime::instance()->blit_kernel().FillImage( + blit_queue_, blit_code_catalog_, *image_view, new_pattern, region); + + // Revert back original configuration. + if (word3_buff != NULL) { + word3_buff->bits.DST_SEL_W = dst_sel_w_original; + } + + if (word3_image != NULL) { + word3_image->bits.DST_SEL_W = dst_sel_w_original; + } + + if (word1 != NULL) { + word1->bits.FORMAT = num_format_original; + } + + return status; +} + +} // namespace image +} // namespace rocr diff --git a/runtime/hsa-runtime/image/image_manager_gfx12.h b/runtime/hsa-runtime/image/image_manager_gfx12.h new file mode 100755 index 0000000000..085dee9c94 --- /dev/null +++ b/runtime/hsa-runtime/image/image_manager_gfx12.h @@ -0,0 +1,101 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef EXT_IMAGE_IMAGE_MANAGER_GFX12_H_ +#define EXT_IMAGE_IMAGE_MANAGER_GFX12_H_ + +#include "addrlib/inc/addrinterface.h" +#include "image_lut_gfx11.h" +#include "image_manager_kv.h" + +namespace rocr { +namespace image { + +class ImageManagerGfx12 : public ImageManagerKv { + public: + ImageManagerGfx12(); + virtual ~ImageManagerGfx12(); + + /// @brief Calculate the size and alignment of the backing storage of an + /// image. + virtual hsa_status_t CalculateImageSizeAndAlignment( + hsa_agent_t component, const hsa_ext_image_descriptor_t& desc, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, size_t image_data_slice_pitch, + hsa_ext_image_data_info_t& image_info) const; + + /// @brief Fill image structure with device specific image object. + virtual hsa_status_t PopulateImageSrd(Image& image) const; + + /// @brief Fill image structure with device specific image object using the given format. + virtual hsa_status_t PopulateImageSrd(Image& image, const metadata_amd_t* desc) const; + + /// @brief Modify device specific image object according to the specified + /// new format. + virtual hsa_status_t ModifyImageSrd(Image& image, + hsa_ext_image_format_t& new_format) const; + + /// @brief Fill sampler structure with device specific sampler object. + virtual hsa_status_t PopulateSamplerSrd(Sampler& sampler) const; + + /// @brief Fill image backing storage using agent copy. + virtual hsa_status_t FillImage(const Image& image, const void* pattern, + const hsa_ext_image_region_t& region); + protected: + uint32_t GetAddrlibSurfaceInfoNv(hsa_agent_t component, + const hsa_ext_image_descriptor_t& desc, + Image::TileMode tileMode, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT& out) const; + + bool IsLocalMemory(const void* address) const; + virtual const ImageLutGfx11& ImageLut() const { return image_lut_gfx11; }; + + private: + ImageLutGfx11 image_lut_gfx11; + DISALLOW_COPY_AND_ASSIGN(ImageManagerGfx12); +}; + +} // namespace image +} // namespace rocr +#endif // EXT_IMAGE_IMAGE_MANAGER_GFX12_H_ diff --git a/runtime/hsa-runtime/image/image_runtime.cpp b/runtime/hsa-runtime/image/image_runtime.cpp index dc4109cd14..3e015be94b 100644 --- a/runtime/hsa-runtime/image/image_runtime.cpp +++ b/runtime/hsa-runtime/image/image_runtime.cpp @@ -55,6 +55,7 @@ #include "image_manager_ai.h" #include "image_manager_nv.h" #include "image_manager_gfx11.h" +#include "image_manager_gfx12.h" #include "device_info.h" namespace rocr { @@ -110,14 +111,22 @@ hsa_status_t ImageRuntime::CreateImageManager(hsa_agent_t agent, void* data) { ImageManager* image_manager; - if (major_ver >= 11) { + switch (major_ver) { + case 12: + image_manager = new ImageManagerGfx12(); + break; + case 11: image_manager = new ImageManagerGfx11(); - } else if (major_ver >= 10) { + break; + case 10: image_manager = new ImageManagerNv(); - } else if (major_ver >= 9) { + break; + case 9: image_manager = new ImageManagerAi(); - } else { + break; + default: image_manager = new ImageManagerKv(); + break; } hsa_error_code = image_manager->Initialize(agent);