Files
rocm-systems/runtime/hsa-runtime/image/blit_kernel.cpp
T
Sean Keely 7e3db20826 Move Images code to hsa-runtime folder
Change-Id: I53c1845d985ac3e9708d952865009c0021f3bb4f
2020-04-30 19:35:57 -05:00

976 lines
33 KiB
C++

#include "blit_kernel.h"
#if (defined(WIN32) || defined(_WIN32))
#define NOMINMAX
#endif
#include <algorithm>
#include <atomic>
#include <sstream>
#include <string>
#include "image_manager.h"
#include "image_runtime.h"
#include "util.h"
#undef HSA_ARGUMENT_ALIGN_BYTES
#define HSA_ARGUMENT_ALIGN_BYTES 16
#include "core/inc/hsa_table_interface.h"
extern uint8_t blit_object_gfx7xx[14608];
extern uint8_t blit_object_gfx8xx[15424];
extern uint8_t blit_object_gfx9xx[15432];
extern uint8_t ocl_blit_object_gfx700[];
extern uint8_t ocl_blit_object_gfx701[];
extern uint8_t ocl_blit_object_gfx702[];
extern uint8_t ocl_blit_object_gfx801[];
extern uint8_t ocl_blit_object_gfx802[];
extern uint8_t ocl_blit_object_gfx803[];
extern uint8_t ocl_blit_object_gfx900[];
extern uint8_t ocl_blit_object_gfx902[];
extern uint8_t ocl_blit_object_gfx904[];
extern uint8_t ocl_blit_object_gfx906[];
extern uint8_t ocl_blit_object_gfx908[];
extern uint8_t ocl_blit_object_gfx1010[];
extern uint8_t ocl_blit_object_gfx1011[];
extern uint8_t ocl_blit_object_gfx1012[];
namespace amd {
// Arguments inserted by OCL compiler, all zero here.
struct OCLHiddenArgs {
uint64_t offset_x;
uint64_t offset_y;
uint64_t offset_z;
void* printf_buffer;
void* enqueue;
void* enqueue2;
void* multi_grid;
};
static void* Allocate(hsa_agent_t agent, size_t size) {
//use the host accessible kernarg pool
hsa_amd_memory_pool_t pool = ext_image::ImageRuntime::instance()->kernarg_pool();
void* ptr = NULL;
hsa_status_t status = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
assert(status == HSA_STATUS_SUCCESS);
if (status != HSA_STATUS_SUCCESS) return NULL;
status = hsa_amd_agents_allow_access(1, &agent, NULL, ptr);
assert(status == HSA_STATUS_SUCCESS);
if (status != HSA_STATUS_SUCCESS) {
hsa_amd_memory_pool_free(ptr);
return NULL;
}
return ptr;
}
BlitKernel::BlitKernel() {
}
BlitKernel::~BlitKernel() {}
hsa_status_t BlitKernel::Initialize() { return HSA_STATUS_SUCCESS; }
hsa_status_t BlitKernel::Cleanup() {
for (std::pair<const uint64_t, hsa_executable_t> pair :
code_executable_map_) {
hsa_executable_destroy(pair.second);
}
code_executable_map_.clear();
code_object_map_.clear();
return HSA_STATUS_SUCCESS;
}
hsa_status_t BlitKernel::BuildBlitCode(
hsa_agent_t agent, std::vector<BlitCodeInfo>& blit_code_catalog) {
// Find existing kernels in the list that have compatible ISA.
hsa_isa_t agent_isa = {0};
hsa_status_t status =
hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &agent_isa);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
std::lock_guard<std::mutex> lock(lock_);
for (std::pair<uint64_t, hsa_executable_t> pair : code_executable_map_) {
bool isa_compatible = false;
hsa_isa_t code_isa = {pair.first};
status = hsa_isa_compatible(code_isa, agent_isa, &isa_compatible);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
if (isa_compatible) {
return PopulateKernelCode(agent, pair.second, blit_code_catalog);
}
}
// No existing compatible kernels. Build new kernels.
hsa_code_object_t code_object = {0};
// Get the target name
char agent_name[64] = {0};
status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, &agent_name);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
// Get the patched code object
uint8_t* patched_code_object;
status = BlitKernel::GetPatchedBlitObject(agent_name, &patched_code_object);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
// Pass the patched code object
code_object.handle = reinterpret_cast<uint64_t>(patched_code_object);
code_object_map_[agent_isa.handle] = code_object;
// Create executable.
hsa_executable_t executable = {0};
status = hsa_executable_create(
HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
code_executable_map_[agent_isa.handle] = executable;
// Load code object.
status = hsa_executable_load_code_object(executable, agent, code_object, "");
if (HSA_STATUS_SUCCESS != status) {
return status;
}
// Freeze executable.
status = hsa_executable_freeze(executable, "");
if (HSA_STATUS_SUCCESS != status) {
return status;
}
return PopulateKernelCode(agent, executable, blit_code_catalog);
}
hsa_status_t BlitKernel::CopyBufferToImage(
BlitQueue& blit_queue, const std::vector<BlitCodeInfo>& blit_code_catalog,
const void* src_memory, size_t src_row_pitch, size_t src_slice_pitch,
const Image& dst_image, const hsa_ext_image_region_t& image_region) {
if (dst_image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) {
ImageManager* manager =
ext_image::ImageRuntime::instance()->image_manager(dst_image.component);
const uint32_t element_size =
manager->GetImageProperty(dst_image.component, dst_image.desc.format,
dst_image.desc.geometry).element_size;
const size_t dst_origin = image_region.offset.x * element_size;
char* dst_memory = reinterpret_cast<char*>(dst_image.data) + dst_origin;
const size_t size = image_region.range.x * element_size;
return hsa_memory_copy(dst_memory, src_memory, size);
}
const Image* dst_image_view = NULL;
hsa_status_t status = ConvertImage(dst_image, &dst_image_view);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
assert(dst_image_view != NULL);
hsa_kernel_dispatch_packet_t packet = {0};
const BlitCodeInfo& blit_code =
blit_code_catalog.at(KERNEL_OP_COPY_BUFFER_TO_IMAGE);
packet.kernel_object = blit_code.code_handle_;
packet.group_segment_size = blit_code.group_segment_size_;
packet.private_segment_size = blit_code.private_segment_size_;
// Setup kernel argument.
/*
buffer is start of output pixel in destination buffer
format.x is element count
format.y is element size
format.z is max(dword per pixel, 1)
format.w is texture type.
pixelOrigin is start pixel address.
*/
struct KernelArgs {
const void* buffer;
uint64_t image[5];
int32_t pixelOrigin[4];
uint32_t format[4];
uint64_t pitch;
uint64_t slice_pitch;
OCLHiddenArgs ocl;
};
KernelArgs* args = (KernelArgs*)Allocate(dst_image_view->component, sizeof(KernelArgs));
assert(args != NULL);
memset(args, 0, sizeof(KernelArgs));
args->buffer = src_memory;
for(auto& img : args->image)
img = dst_image_view->Convert();
args->pixelOrigin[0] = image_region.offset.x;
args->pixelOrigin[1] = image_region.offset.y;
args->pixelOrigin[2] = image_region.offset.z;
ImageManager* manager = ext_image::ImageRuntime::instance()->image_manager(
dst_image_view->component);
const uint32_t element_size =
manager->GetImageProperty(dst_image_view->component,
dst_image_view->desc.format,
dst_image_view->desc.geometry).element_size;
// Try to minimize the read operation to buffer by reading the buffer
// up to one DWORD at a time.
uint32_t buffer_read_count = element_size / sizeof(uint32_t);
buffer_read_count = (buffer_read_count == 0) ? 1 : buffer_read_count;
const uint32_t num_channel = GetNumChannel(*dst_image_view);
const uint32_t size_per_channel = element_size / num_channel;
args->format[0] = num_channel;
args->format[1] = size_per_channel;
args->format[2] = buffer_read_count;
args->format[3] = dst_image_view->desc.geometry;
unsigned long buffer_pitch[2] = {0, 0};
CalcBufferRowSlicePitchesInPixel(dst_image_view->desc.geometry, element_size,
image_region.range, src_row_pitch,
src_slice_pitch, buffer_pitch);
args->pitch = buffer_pitch[0];
args->slice_pitch = buffer_pitch[1];
packet.kernarg_address = args;
// Setup packet dimension and working size.
CalcWorkingSize(*dst_image_view, image_region.range, packet);
status = LaunchKernel(blit_queue, packet);
if (&dst_image != dst_image_view) {
Image::Destroy(dst_image_view);
}
hsa_amd_memory_pool_free(args);
return status;
}
hsa_status_t BlitKernel::CopyImageToBuffer(
BlitQueue& blit_queue, const std::vector<BlitCodeInfo>& blit_code_catalog,
const Image& src_image, void* dst_memory, size_t dst_row_pitch,
size_t dst_slice_pitch, const hsa_ext_image_region_t& image_region) {
if (src_image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) {
ImageManager* manager =
ext_image::ImageRuntime::instance()->image_manager(src_image.component);
const uint32_t element_size =
manager->GetImageProperty(src_image.component, src_image.desc.format,
src_image.desc.geometry).element_size;
const size_t src_origin = image_region.offset.x * element_size;
const char* src_memory =
reinterpret_cast<const char*>(src_image.data) + src_origin;
const size_t size = image_region.range.x * element_size;
return hsa_memory_copy(dst_memory, src_memory, size);
}
const Image* src_image_view = NULL;
hsa_status_t status = ConvertImage(src_image, &src_image_view);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
assert(src_image_view != NULL);
hsa_kernel_dispatch_packet_t packet = {0};
const BlitCodeInfo& blit_code =
blit_code_catalog.at(KERNEL_OP_COPY_IMAGE_TO_BUFFER);
packet.kernel_object = blit_code.code_handle_;
packet.group_segment_size = blit_code.group_segment_size_;
packet.private_segment_size = blit_code.private_segment_size_;
// Setup kernel argument.
/*
buffer is start of output pixel in destination buffer
format.x is element count
format.y is element size
format.z is max(dword per pixel, 1)
format.w is texture type.
pixelOrigin is start pixel address.
*/
struct KernelArgs {
uint64_t image[5];
void* buffer;
int32_t pixelOrigin[4];
uint32_t format[4];
uint64_t pitch;
uint64_t slice_pitch;
OCLHiddenArgs ocl;
};
KernelArgs* args = (KernelArgs*)Allocate(src_image_view->component, sizeof(KernelArgs));
assert(args != NULL);
memset(args, 0, sizeof(KernelArgs));
for(auto &img : args->image)
img = src_image_view->Convert();
args->buffer = dst_memory;
args->pixelOrigin[0] = image_region.offset.x;
args->pixelOrigin[1] = image_region.offset.y;
args->pixelOrigin[2] = image_region.offset.z;
ImageManager* manager = ext_image::ImageRuntime::instance()->image_manager(
src_image_view->component);
const uint32_t element_size =
manager->GetImageProperty(src_image_view->component,
src_image_view->desc.format,
src_image_view->desc.geometry).element_size;
// Try to minimize the write operation to buffer by reading the buffer
// up to one DWORD at a time.
uint32_t buffer_write_count = element_size / sizeof(uint32_t);
buffer_write_count = (buffer_write_count == 0) ? 1 : buffer_write_count;
const uint32_t num_channel = GetNumChannel(*src_image_view);
const uint32_t size_per_channel = element_size / num_channel;
args->format[0] = num_channel;
args->format[1] = size_per_channel;
args->format[2] = buffer_write_count;
args->format[3] = src_image_view->desc.geometry;
unsigned long buffer_pitch[2] = {0, 0};
CalcBufferRowSlicePitchesInPixel(src_image_view->desc.geometry, element_size,
image_region.range, dst_row_pitch,
dst_slice_pitch, buffer_pitch);
args->pitch = buffer_pitch[0];
args->slice_pitch = buffer_pitch[1];
packet.kernarg_address = args;
// Setup packet dimension and working size.
CalcWorkingSize(*src_image_view, image_region.range, packet);
status = LaunchKernel(blit_queue, packet);
if (&src_image != src_image_view) {
Image::Destroy(src_image_view);
}
hsa_amd_memory_pool_free(args);
return status;
}
hsa_status_t BlitKernel::CopyImage(
BlitQueue& blit_queue, const std::vector<BlitCodeInfo>& blit_code_catalog,
const Image& dst_image, const Image& src_image,
const hsa_dim3_t& dst_origin, const hsa_dim3_t& src_origin,
const hsa_dim3_t size, KernelOp copy_type) {
assert(src_image.component.handle == dst_image.component.handle);
const Image* src_image_view = &src_image;
const Image* dst_image_view = &dst_image;
const BlitCodeInfo* blit_code = NULL;
if (copy_type == KERNEL_OP_COPY_IMAGE_DEFAULT) {
// Linear to linear image copy.
hsa_status_t status = ConvertImage(src_image, &src_image_view);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
assert(src_image_view != NULL);
status = ConvertImage(dst_image, &dst_image_view);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
assert(dst_image_view != NULL);
const hsa_ext_image_geometry_t src_geometry = src_image_view->desc.geometry;
const hsa_ext_image_geometry_t dst_geometry = dst_image_view->desc.geometry;
if (src_geometry != HSA_EXT_IMAGE_GEOMETRY_1DB &&
dst_geometry != HSA_EXT_IMAGE_GEOMETRY_1DB) {
blit_code = &blit_code_catalog.at(KERNEL_OP_COPY_IMAGE_DEFAULT);
} else if (src_geometry == HSA_EXT_IMAGE_GEOMETRY_1DB &&
dst_geometry != HSA_EXT_IMAGE_GEOMETRY_1DB) {
blit_code = &blit_code_catalog.at(KERNEL_OP_COPY_IMAGE_1DB_TO_REG);
} else if (src_geometry != HSA_EXT_IMAGE_GEOMETRY_1DB &&
dst_geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) {
blit_code = &blit_code_catalog.at(KERNEL_OP_COPY_IMAGE_REG_TO_1DB);
} else {
blit_code = &blit_code_catalog.at(KERNEL_OP_COPY_IMAGE_1DB);
}
} else {
blit_code = &blit_code_catalog.at(copy_type);
}
hsa_kernel_dispatch_packet_t packet = {0};
packet.kernel_object = blit_code->code_handle_;
packet.group_segment_size = blit_code->group_segment_size_;
packet.private_segment_size = blit_code->private_segment_size_;
// Setup kernel argument.
struct KernelArgs {
uint64_t src[5];
uint64_t dst[5];
int32_t srcOrigin[4];
int32_t dstOrigin[4];
int32_t srcFormat;
int32_t dstFormat;
OCLHiddenArgs ocl;
};
KernelArgs* args = (KernelArgs*)Allocate(dst_image_view->component, sizeof(KernelArgs));
assert(args != NULL);
memset(args, 0, sizeof(KernelArgs));
for(auto& img : args->src)
img = src_image_view->Convert();
args->srcFormat = src_image_view->desc.geometry;
args->srcOrigin[0] = src_origin.x;
args->srcOrigin[1] = src_origin.y;
args->srcOrigin[2] = src_origin.z;
for(auto& img : args->dst)
img = dst_image_view->Convert();
args->dstFormat = dst_image_view->desc.geometry;
args->dstOrigin[0] = dst_origin.x;
args->dstOrigin[1] = dst_origin.y;
args->dstOrigin[2] = dst_origin.z;
packet.kernarg_address = args;
// Setup packet dimension and working size.
CalcWorkingSize(*src_image_view, *dst_image_view, size, packet);
hsa_status_t status = LaunchKernel(blit_queue, packet);
if (&src_image != src_image_view) {
Image::Destroy(src_image_view);
}
if (&dst_image != dst_image_view) {
Image::Destroy(dst_image_view);
}
hsa_amd_memory_pool_free(args);
return status;
}
hsa_status_t BlitKernel::FillImage(
BlitQueue& blit_queue, const std::vector<BlitCodeInfo>& blit_code_catalog,
const Image& image, const void* pattern,
const hsa_ext_image_region_t& region) {
hsa_kernel_dispatch_packet_t packet = {0};
const BlitCodeInfo& blit_code =
(image.desc.geometry != HSA_EXT_IMAGE_GEOMETRY_1DB)
? blit_code_catalog.at(KERNEL_OP_CLEAR_IMAGE)
: blit_code_catalog.at(KERNEL_OP_CLEAR_IMAGE_1DB);
packet.kernel_object = blit_code.code_handle_;
packet.group_segment_size = blit_code.group_segment_size_;
packet.private_segment_size = blit_code.private_segment_size_;
// Setup kernel argument.
struct KernelArgs {
uint64_t image[5];
int32_t format;
uint32_t type;
uint32_t data[4];
int32_t origin[4];
OCLHiddenArgs ocl;
};
KernelArgs* args = (KernelArgs*)Allocate(image.component, sizeof(KernelArgs));
assert(args != NULL);
memset(args, 0, sizeof(KernelArgs));
for(auto &img : args->image)
img = image.Convert();
args->format = image.desc.geometry;
for(int i=0; i<4; i++)
args->data[i] = ((const uint32_t*)pattern)[i];
args->origin[0] = region.offset.x;
args->origin[1] = region.offset.y;
args->origin[2] = region.offset.z;
args->type = GetImageAccessType(image);
packet.kernarg_address = args;
// Setup packet dimension and working size.
CalcWorkingSize(image, region.range, packet);
hsa_status_t status = LaunchKernel(blit_queue, packet);
hsa_amd_memory_pool_free(args);
return status;
}
const char *BlitKernel::kernel_name_[KERNEL_OP_COUNT] = {
"&__copy_image_to_buffer_kernel",
"&__copy_buffer_to_image_kernel",
"&__copy_image_default_kernel",
"&__copy_image_linear_to_standard_kernel",
"&__copy_image_standard_to_linear_kernel",
"&__copy_image_1db_kernel",
"&__copy_image_1db_to_reg_kernel",
"&__copy_image_reg_to_1db_kernel",
"&__clear_image_kernel",
"&__clear_image_1db_kernel"};
const char *BlitKernel::ocl_kernel_name_[KERNEL_OP_COUNT] = {
"copy_image_to_buffer.kd",
"copy_buffer_to_image.kd",
"copy_image_default.kd",
"copy_image_linear_to_standard.kd",
"copy_image_standard_to_linear.kd",
"copy_image_1db.kd",
"copy_image_1db_to_reg.kd",
"copy_image_reg_to_1db.kd",
"clear_image.kd",
"clear_image_1db.kd"};
hsa_status_t BlitKernel::PopulateKernelCode(
hsa_agent_t agent, hsa_executable_t executable,
std::vector<BlitCodeInfo>& blit_code_catalog) {
blit_code_catalog.clear();
for (int i = 0; i < KERNEL_OP_COUNT; ++i) {
// Get symbol handle.
hsa_executable_symbol_t kernel_symbol = {0};
hsa_status_t status = hsa_executable_get_symbol_by_name(executable, ocl_kernel_name_[i], &agent, &kernel_symbol);
if (HSA_STATUS_SUCCESS != status) {
blit_code_catalog.clear();
return status;
}
// Get code handle.
BlitCodeInfo blit_code = {0};
status = hsa_executable_symbol_get_info(
kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&blit_code.code_handle_);
if (HSA_STATUS_SUCCESS != status) {
blit_code_catalog.clear();
return status;
}
status = hsa_executable_symbol_get_info(
kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&blit_code.group_segment_size_);
if (HSA_STATUS_SUCCESS != status) {
blit_code_catalog.clear();
return status;
}
status = hsa_executable_symbol_get_info(
kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&blit_code.private_segment_size_);
if (HSA_STATUS_SUCCESS != status) {
blit_code_catalog.clear();
return status;
}
blit_code_catalog.push_back(blit_code);
}
assert(blit_code_catalog.size() == KERNEL_OP_COUNT);
return HSA_STATUS_SUCCESS;
}
void BlitKernel::CalcBufferRowSlicePitchesInPixel(
hsa_ext_image_geometry_t geometry, uint32_t element_size,
const hsa_dim3_t& copy_size, size_t in_row_pitch_byte,
size_t in_slice_pitch_byte, unsigned long* out_pitch_pixel) {
const bool is_1d_array =
(geometry == HSA_EXT_IMAGE_GEOMETRY_1DA) ? true : false;
out_pitch_pixel[0] =
std::max(static_cast<unsigned long>(copy_size.x),
static_cast<unsigned long>(in_row_pitch_byte / element_size));
out_pitch_pixel[1] =
(is_1d_array)
? out_pitch_pixel[0]
: (std::max(
static_cast<unsigned long>(out_pitch_pixel[0] * copy_size.y),
static_cast<unsigned long>(in_slice_pitch_byte /
element_size)));
assert((out_pitch_pixel[0] <= out_pitch_pixel[1]));
}
uint32_t BlitKernel::GetDimSize(const Image& image) {
static const uint32_t kDimSizeTable[] = {
1, // HSA_EXT_IMAGE_GEOMETRY_1D
2, // HSA_EXT_IMAGE_GEOMETRY_2D
3, // HSA_EXT_IMAGE_GEOMETRY_3D
2, // HSA_EXT_IMAGE_GEOMETRY_1DA
3, // HSA_EXT_IMAGE_GEOMETRY_2DA
1, // HSA_EXT_IMAGE_GEOMETRY_1DB
2, // HSA_EXT_IMAGE_GEOMETRY_2DDEPTH
3, // HSA_EXT_IMAGE_GEOMETRY_2DADEPTH
};
return kDimSizeTable[image.desc.geometry];
}
uint32_t BlitKernel::GetNumChannel(const Image& image) {
static const uint32_t kNumChannelTable[] = {
1, // HSA_EXT_IMAGE_CHANNEL_ORDER_A,
1, // HSA_EXT_IMAGE_CHANNEL_ORDER_R,
1, // HSA_EXT_IMAGE_CHANNEL_ORDER_RX,
2, // HSA_EXT_IMAGE_CHANNEL_ORDER_RG,
2, // HSA_EXT_IMAGE_CHANNEL_ORDER_RGX,
2, // HSA_EXT_IMAGE_CHANNEL_ORDER_RA,
3, // HSA_EXT_IMAGE_CHANNEL_ORDER_RGB,
3, // HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX,
4, // HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA,
4, // HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA,
4, // HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB,
4, // HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR,
3, // HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB,
3, // HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX,
4, // HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA,
4, // HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA,
1, // HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY,
1, // HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE,
1, // HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH,
1, // HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL
};
return kNumChannelTable[image.desc.format.channel_order];
}
uint32_t BlitKernel::GetImageAccessType(const Image& image) {
enum AccessType {
ACCESS_TYPE_F = 0,
ACCESS_TYPE_I = 1,
ACCESS_TYPE_UI = 2,
};
static const uint32_t kAccessType[] = {
ACCESS_TYPE_F, // HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8
ACCESS_TYPE_F, // HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16
ACCESS_TYPE_F, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8
ACCESS_TYPE_F, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16
ACCESS_TYPE_F, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24
ACCESS_TYPE_F, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
ACCESS_TYPE_F, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
ACCESS_TYPE_F, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010
ACCESS_TYPE_I, // HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8
ACCESS_TYPE_I, // HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16
ACCESS_TYPE_I, // HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32
ACCESS_TYPE_UI, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
ACCESS_TYPE_UI, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
ACCESS_TYPE_UI, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
ACCESS_TYPE_F, // HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT
ACCESS_TYPE_F // HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT
};
return kAccessType[image.desc.format.channel_type];
}
void BlitKernel::CalcWorkingSize(const Image& image, const hsa_dim3_t& range,
hsa_kernel_dispatch_packet_t& packet) {
switch (image.desc.geometry) {
case HSA_EXT_IMAGE_GEOMETRY_1D:
case HSA_EXT_IMAGE_GEOMETRY_1DB:
case HSA_EXT_IMAGE_GEOMETRY_1DA:
packet.setup = 2;
packet.grid_size_x = range.x;
packet.grid_size_y = range.y;
packet.grid_size_z = 1;
packet.workgroup_size_x = 64;
packet.workgroup_size_y = packet.workgroup_size_z = 1;
break;
case HSA_EXT_IMAGE_GEOMETRY_2D:
case HSA_EXT_IMAGE_GEOMETRY_2DDEPTH:
case HSA_EXT_IMAGE_GEOMETRY_2DADEPTH:
case HSA_EXT_IMAGE_GEOMETRY_2DA:
packet.setup = 3;
packet.grid_size_x = range.x;
packet.grid_size_y = range.y;
packet.grid_size_z = range.z;
packet.workgroup_size_x = packet.workgroup_size_y = 8;
packet.workgroup_size_z = 1;
break;
case HSA_EXT_IMAGE_GEOMETRY_3D:
packet.setup = 3;
packet.grid_size_x = range.x;
packet.grid_size_y = range.y;
packet.grid_size_z = range.z;
packet.workgroup_size_x = packet.workgroup_size_y = 4;
packet.workgroup_size_z = 4;
break;
}
}
void BlitKernel::CalcWorkingSize(const Image& src_image, const Image& dst_image,
const hsa_dim3_t& range,
hsa_kernel_dispatch_packet_t& packet) {
if (GetDimSize(src_image) < GetDimSize(dst_image)) {
CalcWorkingSize(src_image, range, packet);
} else {
CalcWorkingSize(dst_image, range, packet);
}
}
hsa_status_t BlitKernel::ConvertImage(const Image& original_image,
const Image** new_image) {
// To simplify the kernel, some particular image channel types are converted
// to a new channel type, while preserving the actual per pixel size.
// E.g.: a UNORM SIGNED INT8 is converted into UNSIGNED INT8. This way the
// kernel can just use read_imageui on all images.
static const uint32_t kTypeConvertTable[] = {
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, // HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, // HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, // HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, // HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, // HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, // HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, // HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 // HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT
};
// To simplify the kernel, some particular image channel orders are converted
// to a new channel order, while preserving the actual per pixel size.
// E.g.: a CHANNEL ORDER A is converted into CHANNEL ORDER R. This way the
// kernel can just read the first components of vector4 on all images.
static const uint32_t kOrderConvertTable[] = {
HSA_EXT_IMAGE_CHANNEL_ORDER_R, // HSA_EXT_IMAGE_CHANNEL_ORDER_A
HSA_EXT_IMAGE_CHANNEL_ORDER_R, // HSA_EXT_IMAGE_CHANNEL_ORDER_R
HSA_EXT_IMAGE_CHANNEL_ORDER_R, // HSA_EXT_IMAGE_CHANNEL_ORDER_RX
HSA_EXT_IMAGE_CHANNEL_ORDER_RG, // HSA_EXT_IMAGE_CHANNEL_ORDER_RG
HSA_EXT_IMAGE_CHANNEL_ORDER_RG, // HSA_EXT_IMAGE_CHANNEL_ORDER_RGX
HSA_EXT_IMAGE_CHANNEL_ORDER_RG, // HSA_EXT_IMAGE_CHANNEL_ORDER_RA
HSA_EXT_IMAGE_CHANNEL_ORDER_RGB, // HSA_EXT_IMAGE_CHANNEL_ORDER_RGB
HSA_EXT_IMAGE_CHANNEL_ORDER_RGB, // HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, // HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, // HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, // HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, // HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, // HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, // HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, // HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, // HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA
HSA_EXT_IMAGE_CHANNEL_ORDER_R, // HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY
HSA_EXT_IMAGE_CHANNEL_ORDER_R, // HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE
HSA_EXT_IMAGE_CHANNEL_ORDER_R, // HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH
HSA_EXT_IMAGE_CHANNEL_ORDER_RG // HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL
};
const uint32_t current_type = original_image.desc.format.channel_type;
uint32_t converted_type = kTypeConvertTable[current_type];
const uint32_t current_order = original_image.desc.format.channel_order;
uint32_t converted_order = kOrderConvertTable[current_order];
if ((current_type == converted_type) && (current_order == converted_order)) {
*new_image = &original_image;
return HSA_STATUS_SUCCESS;
}
// Handle formats that drop channels on conversion, only usable with RGB(X)
if((current_type == HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555) ||
(current_type == HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565) ||
(current_type == HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010)) {
converted_order = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
}
// For internal book keeping, depth isn't a HW type.
const hsa_ext_image_geometry_t current_geometry =
original_image.desc.geometry;
hsa_ext_image_geometry_t converted_geometry = current_geometry;
if (converted_geometry == HSA_EXT_IMAGE_GEOMETRY_2DDEPTH) {
converted_geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
} else if (converted_geometry == HSA_EXT_IMAGE_GEOMETRY_2DADEPTH) {
converted_geometry = HSA_EXT_IMAGE_GEOMETRY_2DA;
}
hsa_ext_image_format_t new_format = {
static_cast<hsa_ext_image_channel_type_t>(converted_type),
static_cast<hsa_ext_image_channel_order_t>(converted_order)};
amd::Image* new_image_handle = amd::Image::Create(original_image.component);
*new_image_handle=original_image;
new_image_handle->desc.geometry = converted_geometry;
hsa_status_t status = ext_image::ImageRuntime::instance()
->image_manager(new_image_handle->component)
->ModifyImageSrd(*new_image_handle, new_format);
if (status != HSA_STATUS_SUCCESS) {
return status;
}
*new_image = new_image_handle;
return HSA_STATUS_SUCCESS;
}
hsa_status_t BlitKernel::LaunchKernel(BlitQueue& blit_queue,
hsa_kernel_dispatch_packet_t& packet) {
static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID;
static const uint16_t kDispatchPacketHeader =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(0 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
// Copying the packet content to the queue buffer is not atomic, so it is
// possible that the packet has a valid packet type but invalid content.
// To make sure packet processor does not read invalid packet, we first
// initialized the packet type to invalid.
packet.header = kInvalidPacketHeader;
// Setup completion signal.
hsa_signal_t kernel_signal = {0};
hsa_status_t status = hsa_signal_create(1, 0, NULL, &kernel_signal);
if (HSA_STATUS_SUCCESS != status) {
return status;
}
packet.completion_signal = kernel_signal;
// Populate the queue.
hsa_queue_t* queue = blit_queue.queue_;
const uint32_t bitmask = queue->size - 1;
// Reserve write index.
uint64_t write_index = hsa_queue_add_write_index_acq_rel(queue, 1);
while (true) {
// Wait until we have room in the queue;
const uint64_t read_index = hsa_queue_load_read_index_relaxed(queue);
if ((write_index - read_index) < queue->size) {
break;
}
}
// Populate queue buffer with AQL packet.
hsa_kernel_dispatch_packet_t* queue_buffer =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address);
queue_buffer[write_index & bitmask] = packet;
std::atomic_thread_fence(std::memory_order_release);
// Enable packet.
queue_buffer[write_index & bitmask].header = kDispatchPacketHeader;
// Update doorbel register.
hsa_signal_store_release(queue->doorbell_signal, write_index);
// Wait for the packet to finish.
if (hsa_signal_wait_acquire(kernel_signal, HSA_SIGNAL_CONDITION_LT, 1,
uint64_t(-1), HSA_WAIT_STATE_ACTIVE) != 0) {
status = hsa_signal_destroy(kernel_signal);
assert(status == HSA_STATUS_SUCCESS);
// Signal wait returned unexpected value.
return HSA_STATUS_ERROR;
}
// Cleanup
status = hsa_signal_destroy(kernel_signal);
assert(status == HSA_STATUS_SUCCESS);
return HSA_STATUS_SUCCESS;
}
hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name,
uint8_t** blit_code_object) {
if (strncmp(agent_name, "gfx", 3) != 0) {
return HSA_STATUS_ERROR_INVALID_ISA_NAME;
}
uint64_t target_name = atoi(&agent_name[3]);
switch (target_name) {
case 700:
*blit_code_object = ocl_blit_object_gfx700;
break;
case 701:
*blit_code_object = ocl_blit_object_gfx701;
break;
case 702:
*blit_code_object = ocl_blit_object_gfx702;
break;
case 801:
*blit_code_object = ocl_blit_object_gfx801;
break;
case 802:
*blit_code_object = ocl_blit_object_gfx802;
break;
case 803:
*blit_code_object = ocl_blit_object_gfx803;
break;
case 900:
*blit_code_object = ocl_blit_object_gfx900;
break;
case 902:
*blit_code_object = ocl_blit_object_gfx902;
break;
case 904:
*blit_code_object = ocl_blit_object_gfx904;
break;
case 906:
*blit_code_object = ocl_blit_object_gfx906;
break;
case 908:
*blit_code_object = ocl_blit_object_gfx908;
break;
case 1010:
*blit_code_object = ocl_blit_object_gfx1010;
break;
case 1011:
*blit_code_object = ocl_blit_object_gfx1011;
break;
case 1012:
*blit_code_object = ocl_blit_object_gfx1012;
break;
default:
return HSA_STATUS_ERROR_INVALID_ISA_NAME;
}
return HSA_STATUS_SUCCESS;
}
} // namespace amd
#undef HSA_ARGUMENT_ALIGN_BYTES