SWDEV-432174 - Change the fillBuffer kernel
- Add the new fillBuffer kernel, which allows to launch a limited number of workgroups for memory fill operation - Switch fill memory to 16 bytes write by default - Allow to limit the workgroups with DEBUG_CLR_LIMIT_BLIT_WG Change-Id: Ibad1822f2d42b2fc71bcfc1917c31409c0623e8e
Este cometimento está contido em:
+41
-67
@@ -344,7 +344,7 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory&
|
||||
bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin,
|
||||
const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
|
||||
bool entire, size_t rowPitch, size_t slicePitch,
|
||||
bool entire, size_t rowPitch, size_t slicePitch,
|
||||
amd::CopyMetadata copyMetadata) const {
|
||||
size_t startLayer = srcOrigin[2];
|
||||
size_t numLayers = size[2];
|
||||
@@ -475,7 +475,7 @@ bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memor
|
||||
|
||||
bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire,
|
||||
const amd::Coord3D& size, bool entire,
|
||||
amd::CopyMetadata copyMetadata) const {
|
||||
size_t startLayer = srcOrigin[2];
|
||||
size_t numLayers = size[2];
|
||||
@@ -685,49 +685,41 @@ uint32_t HostBlitManager::sRGBmap(float fc) const {
|
||||
return (uint32_t)(c * 255.0 + 0.5);
|
||||
}
|
||||
|
||||
bool HostBlitManager::FillBufferInfo::ExpandPattern64(uint64_t pattern, size_t pattern_size,
|
||||
uint64_t& pattern64) {
|
||||
// ================================================================================================
|
||||
void HostBlitManager::FillBufferInfo::ExpandPattern(uint32_t pattern_size, const void* pattern) {
|
||||
// If pattern size exceeds extended, then runtime will select the normal path
|
||||
if (pattern_size >= kExtendedSize) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool retval = true;
|
||||
|
||||
do {
|
||||
|
||||
// If the pattern is 0 or if the pattern_size is same as max size.
|
||||
if (pattern == 0 || pattern_size == sizeof(uint64_t)) {
|
||||
pattern64 = pattern;
|
||||
break;
|
||||
pattern_expanded_ = true;
|
||||
if (pattern_size == sizeof(uint8_t)) {
|
||||
uint8_t pattern_byte = *reinterpret_cast<const uint8_t*>(pattern);
|
||||
for (uint32_t i = 0; i < kExtendedSize; ++i) {
|
||||
reinterpret_cast<uint8_t*>(expanded_pattern_)[i] = pattern_byte;
|
||||
}
|
||||
|
||||
// Clean Curr_pattern, since it was casted off from const void* with a lesser size than size_t.
|
||||
ClearBits64(pattern, (pattern_size * 8));
|
||||
pattern64 = 0;
|
||||
|
||||
if (pattern_size == sizeof(uint8_t)) {
|
||||
pattern = pattern & 0xff;
|
||||
pattern64 = ((pattern << 56) | (pattern << 48) | (pattern << 40) | (pattern << 32)
|
||||
| (pattern << 24) | (pattern << 16) | (pattern << 8) | (pattern));
|
||||
} else if (pattern_size == sizeof(uint16_t)) {
|
||||
pattern = pattern & 0xffff;
|
||||
pattern64 = ((pattern << 48) | (pattern << 32) | (pattern << 16) | (pattern));
|
||||
} else if (pattern_size == sizeof(uint32_t)) {
|
||||
pattern = pattern & 0xffffffff;
|
||||
pattern64 = ((pattern << 32) | (pattern));
|
||||
} else {
|
||||
LogPrintfError("Unsupported Pattern size: %u \n", pattern_size);
|
||||
retval = false;
|
||||
break;
|
||||
} else if (pattern_size == sizeof(uint16_t)) {
|
||||
uint16_t pattern_word = *reinterpret_cast<const uint16_t*>(pattern);
|
||||
for (uint32_t i = 0; i < kExtendedSize / sizeof(uint16_t); ++i) {
|
||||
reinterpret_cast<uint16_t*>(expanded_pattern_)[i] = pattern_word;
|
||||
}
|
||||
|
||||
} while (0);
|
||||
|
||||
return retval;
|
||||
} else if (pattern_size == sizeof(uint32_t)) {
|
||||
uint32_t pattern_dword = *reinterpret_cast<const uint32_t*>(pattern);
|
||||
for (uint32_t i = 0; i < kExtendedSize / sizeof(uint32_t); ++i) {
|
||||
reinterpret_cast<uint32_t*>(expanded_pattern_)[i] = pattern_dword;
|
||||
}
|
||||
} else {
|
||||
uint64_t pattern_qword = *reinterpret_cast<const uint64_t*>(pattern);
|
||||
reinterpret_cast<uint64_t*>(expanded_pattern_)[0] = pattern_qword;
|
||||
reinterpret_cast<uint64_t*>(expanded_pattern_)[1] = pattern_qword;
|
||||
}
|
||||
}
|
||||
|
||||
bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, size_t fill_size,
|
||||
size_t fill_origin, const void* pattern_ptr,
|
||||
size_t pattern_size,
|
||||
std::vector<FillBufferInfo>& packed_info) {
|
||||
|
||||
// ================================================================================================
|
||||
void HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, size_t fill_size,
|
||||
size_t fill_origin, const void* pattern_ptr,
|
||||
size_t pattern_size,
|
||||
std::vector<FillBufferInfo>& packed_info) {
|
||||
// 1. Validate input arguments
|
||||
guarantee(fill_size >= pattern_size, "Pattern Size: %u cannot be greater than fill size: %u \n",
|
||||
pattern_size, fill_size);
|
||||
@@ -736,60 +728,42 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz
|
||||
|
||||
// 2. Calculate the next closest dword aligned address for faster processing
|
||||
size_t dst_addr = memory.virtualAddress() + fill_origin;
|
||||
size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t));
|
||||
size_t aligned_dst_addr = amd::alignUp(dst_addr, kExtendedSize);
|
||||
guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %u cannot be greater than destination"
|
||||
"address :%u \n", aligned_dst_addr, dst_addr);
|
||||
|
||||
// 3. If given address is not aligned calculate head and tail size.
|
||||
size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size);
|
||||
size_t aligned_size = ((fill_size - head_size) / sizeof(size_t)) * sizeof(size_t);
|
||||
size_t tail_size = (fill_size - head_size) % sizeof(size_t);
|
||||
size_t aligned_size = ((fill_size - head_size) / kExtendedSize) * kExtendedSize;
|
||||
size_t tail_size = (fill_size - head_size) % kExtendedSize;
|
||||
guarantee((head_size + aligned_size + tail_size) <= fill_size, "Head size, aligned size & tail"
|
||||
"size together cannot cross fill size");
|
||||
|
||||
// 4. Clear unwanted bytes from the pattern if the pattern size is < sizeof(size_t).
|
||||
uint64_t pattern = *(reinterpret_cast<uint64_t*>(const_cast<void*>(pattern_ptr)));
|
||||
if (pattern_size < sizeof(uint64_t)) {
|
||||
ClearBits64(pattern, (pattern_size * 8));
|
||||
}
|
||||
|
||||
// 5. Fill the head, aligned, tail info if they exist.
|
||||
FillBufferInfo fill_info;
|
||||
// 4. Fill the head, aligned, tail info if they exist.
|
||||
if (head_size > 0) {
|
||||
// Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern.
|
||||
guarantee((head_size % pattern_size) == 0, "Offseted ptr should align with pattern_size");
|
||||
|
||||
fill_info.fill_size_ = head_size;
|
||||
FillBufferInfo fill_info(head_size);
|
||||
packed_info.push_back(fill_info);
|
||||
}
|
||||
|
||||
fill_info.clearInfo();
|
||||
if (aligned_size > 0) {
|
||||
// Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern.
|
||||
guarantee((aligned_size % pattern_size) == 0, "Offseted ptr should align with pattern_size");
|
||||
|
||||
if (pattern_size < sizeof(uint64_t)) {
|
||||
if (!ExpandPattern64(pattern, pattern_size, fill_info.expanded_pattern_)) {
|
||||
DevLogPrintfError("Failed Expanding the pattern for pattern:%u, pattern_size: %u",
|
||||
pattern, pattern_size);
|
||||
return false;
|
||||
}
|
||||
fill_info.pattern_expanded_ = true;
|
||||
}
|
||||
fill_info.fill_size_ = aligned_size;
|
||||
FillBufferInfo fill_info(aligned_size);
|
||||
fill_info.ExpandPattern(pattern_size, pattern_ptr);
|
||||
packed_info.push_back(fill_info);
|
||||
}
|
||||
fill_info.clearInfo();
|
||||
|
||||
if (tail_size > 0) {
|
||||
// Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern.
|
||||
guarantee((tail_size % pattern_size) == 0, "Offseted ptr should align with pattern_size");
|
||||
|
||||
fill_info.fill_size_ = tail_size;
|
||||
FillBufferInfo fill_info(tail_size);
|
||||
packed_info.push_back(fill_info);
|
||||
}
|
||||
fill_info.clearInfo();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
+21
-33
@@ -78,7 +78,7 @@ class BlitManager : public amd::HeapObject {
|
||||
const amd::Coord3D& origin, //!< Source origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata()//!< Memory copy MetaData
|
||||
) const = 0;
|
||||
|
||||
@@ -89,7 +89,7 @@ class BlitManager : public amd::HeapObject {
|
||||
const amd::BufferRect& hostRect, //!< Destination rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata() //!< Memory copy MetaData
|
||||
) const = 0;
|
||||
|
||||
@@ -101,7 +101,7 @@ class BlitManager : public amd::HeapObject {
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata()//!< Memory copy MetaData
|
||||
) const = 0;
|
||||
|
||||
@@ -111,7 +111,7 @@ class BlitManager : public amd::HeapObject {
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata() //!< Memory copy MetaData
|
||||
) const = 0;
|
||||
|
||||
@@ -122,7 +122,7 @@ class BlitManager : public amd::HeapObject {
|
||||
const amd::BufferRect& bufRect, //!< Source rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata() //!< Memory copy MetaData
|
||||
) const = 0;
|
||||
|
||||
@@ -134,7 +134,7 @@ class BlitManager : public amd::HeapObject {
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata() //!< Memory copy MetaData
|
||||
) const = 0;
|
||||
|
||||
@@ -193,7 +193,7 @@ class BlitManager : public amd::HeapObject {
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata() //!< Memory copy MetaData
|
||||
) const = 0;
|
||||
|
||||
@@ -301,7 +301,7 @@ class HostBlitManager : public device::BlitManager {
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata() //!< Memory copy MetaData
|
||||
) const;
|
||||
|
||||
@@ -334,7 +334,7 @@ class HostBlitManager : public device::BlitManager {
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata() //!< Memory copy MetaData
|
||||
) const;
|
||||
|
||||
@@ -393,7 +393,7 @@ class HostBlitManager : public device::BlitManager {
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata copyMetadata =
|
||||
amd::CopyMetadata() //!< Memory copy MetaData
|
||||
) const;
|
||||
|
||||
@@ -423,36 +423,24 @@ class HostBlitManager : public device::BlitManager {
|
||||
const amd::Device& dev_; //!< Physical device
|
||||
|
||||
// Packed Fill Buffer
|
||||
class FillBufferInfo {
|
||||
public:
|
||||
FillBufferInfo(): fill_size_(0), expanded_pattern_(0), pattern_expanded_(false) {}
|
||||
struct FillBufferInfo {
|
||||
static constexpr uint32_t kExtendedSize = 2 * sizeof(uint64_t);
|
||||
|
||||
static bool PackInfo(const device::Memory& memory, size_t fill_size,
|
||||
size_t fill_origin, const void* pattern, size_t pattern_size,
|
||||
std::vector<FillBufferInfo>& packed_info);
|
||||
|
||||
private:
|
||||
static bool ExpandPattern64(uint64_t pattern, size_t pattern_size, uint64_t& pattern64);
|
||||
|
||||
static inline void ClearBits64(uint64_t& pattern, uint64_t num_bits) {
|
||||
pattern &= ~(~(static_cast<uint64_t>(0)) << num_bits);
|
||||
static void PackInfo(const device::Memory& memory, size_t fill_size,
|
||||
size_t fill_origin, const void* pattern, size_t pattern_size,
|
||||
std::vector<FillBufferInfo>& packed_info);
|
||||
|
||||
FillBufferInfo(size_t fill_size): fill_size_(fill_size), pattern_expanded_(false) {
|
||||
memset(&expanded_pattern_, 0, sizeof(expanded_pattern_));
|
||||
}
|
||||
|
||||
void clearInfo () {
|
||||
fill_size_ = 0;
|
||||
expanded_pattern_ = 0;
|
||||
pattern_expanded_ = false;
|
||||
}
|
||||
void ExpandPattern(uint32_t pattern_size, const void* pattern);
|
||||
|
||||
public:
|
||||
size_t fill_size_; // Fill size for this command
|
||||
uint64_t expanded_pattern_; // Pattern for this command
|
||||
bool pattern_expanded_; // Boolean to check if pattern is expanded
|
||||
size_t fill_size_; //!< Fill size for this command
|
||||
uint8_t expanded_pattern_[kExtendedSize]; //!< Pattern for this command - 16 bytes
|
||||
bool pattern_expanded_; //!< Boolean to check if pattern is expanded
|
||||
};
|
||||
|
||||
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
HostBlitManager(const HostBlitManager&);
|
||||
|
||||
+51
-10
@@ -47,16 +47,57 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
|
||||
|
||||
extern void __ockl_gws_init(uint nwm1, uint rid);
|
||||
|
||||
// Implementation
|
||||
__kernel void __amd_rocclr_fillBufferAligned(__global uchar* bufUChar,
|
||||
__global ushort* bufUShort,
|
||||
__global uint* bufUInt,
|
||||
__global ulong* bufULong,
|
||||
__constant uchar* pattern,
|
||||
uint patternSize, ulong offset,
|
||||
ulong size) {
|
||||
__amd_fillBufferAligned(bufUChar, bufUShort, bufUInt, bufULong,
|
||||
pattern, patternSize, offset, size);
|
||||
__kernel void __amd_rocclr_fillBufferAligned(
|
||||
__global uchar* bufUChar, __global ushort* bufUShort, __global uint* bufUInt,
|
||||
__global ulong* bufULong, __global ulong2* bufULong2, __constant uchar* pattern,
|
||||
uint pattern_size, ulong offset, ulong end_ptr, uint next_chunk) {
|
||||
int id = get_global_id(0);
|
||||
long cur_id = offset + id * pattern_size;
|
||||
if (bufULong2) {
|
||||
__global ulong2* element = &bufULong2[cur_id];
|
||||
__constant ulong2* pt = (__constant ulong2*)pattern;
|
||||
while ((ulong)element < end_ptr) {
|
||||
for (uint i = 0; i < pattern_size; ++i) {
|
||||
element[i] = pt[i];
|
||||
}
|
||||
element += next_chunk;
|
||||
}
|
||||
} else if (bufULong) {
|
||||
__global ulong* element = &bufULong[cur_id];
|
||||
__constant ulong* pt = (__constant ulong*)pattern;
|
||||
while ((ulong)element < end_ptr) {
|
||||
for (uint i = 0; i < pattern_size; ++i) {
|
||||
element[i] = pt[i];
|
||||
}
|
||||
element += next_chunk;
|
||||
}
|
||||
} else if (bufUInt) {
|
||||
__global uint* element = &bufUInt[cur_id];
|
||||
__constant uint* pt = (__constant uint*)pattern;
|
||||
while ((ulong)element < end_ptr) {
|
||||
for (uint i = 0; i < pattern_size; ++i) {
|
||||
element[i] = pt[i];
|
||||
}
|
||||
element += next_chunk;
|
||||
}
|
||||
} else if (bufUShort) {
|
||||
__global ushort* element = &bufUShort[cur_id];
|
||||
__constant ushort* pt = (__constant ushort*)pattern;
|
||||
while ((ulong)element < end_ptr) {
|
||||
for (uint i = 0; i < pattern_size; ++i) {
|
||||
element[i] = pt[i];
|
||||
}
|
||||
element += next_chunk;
|
||||
}
|
||||
} else {
|
||||
__global uchar* element = &bufUChar[cur_id];
|
||||
while ((ulong)element < end_ptr) {
|
||||
for (uint i = 0; i < pattern_size; ++i) {
|
||||
element[i] = pattern[i];
|
||||
}
|
||||
element += next_chunk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void __amd_rocclr_fillBufferAligned2D(__global uchar* bufUChar,
|
||||
|
||||
@@ -2172,63 +2172,96 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
|
||||
synchronize();
|
||||
return result;
|
||||
} else {
|
||||
uint fillType = FillBufferAligned;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
uint64_t fillSize = size[0] / patternSize;
|
||||
size_t globalWorkSize = amd::alignUp(fillSize, 256);
|
||||
size_t localWorkSize = 256;
|
||||
uint32_t alignment = (patternSize & 0x7) == 0 ?
|
||||
sizeof(uint64_t) :
|
||||
(patternSize & 0x3) == 0 ?
|
||||
sizeof(uint32_t) :
|
||||
(patternSize & 0x1) == 0 ?
|
||||
sizeof(uint16_t) : sizeof(uint8_t);
|
||||
// Pack the fill buffer info, that handles unaligned memories.
|
||||
std::vector<FillBufferInfo> packed_vector{};
|
||||
FillBufferInfo::PackInfo(memory, size[0], origin[0], pattern, patternSize, packed_vector);
|
||||
|
||||
// Program kernels arguments for the fill operation
|
||||
Memory* mem = &gpuMem(memory);
|
||||
if (alignment == sizeof(uint64_t)) {
|
||||
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
|
||||
} else if (alignment == sizeof(uint32_t)) {
|
||||
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
|
||||
} else if (alignment == sizeof(uint16_t)) {
|
||||
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
|
||||
} else {
|
||||
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
|
||||
size_t overall_offset = origin[0];
|
||||
for (auto& packed_obj : packed_vector) {
|
||||
constexpr uint32_t kFillType = FillBufferAligned;
|
||||
uint32_t kpattern_size = (packed_obj.pattern_expanded_)
|
||||
? HostBlitManager::FillBufferInfo::kExtendedSize
|
||||
: patternSize;
|
||||
size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
|
||||
size_t koffset = overall_offset;
|
||||
overall_offset += packed_obj.fill_size_;
|
||||
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) :
|
||||
(kpattern_size & 0x7) == 0 ? sizeof(uint64_t) :
|
||||
(kpattern_size & 0x3) == 0 ? sizeof(uint32_t) :
|
||||
(kpattern_size & 0x1) == 0 ? sizeof(uint16_t) : sizeof(uint8_t);
|
||||
|
||||
// Program kernels arguments for the fill operation
|
||||
Memory* mem = &gpuMem(memory);
|
||||
if (alignment == 2 * sizeof(uint64_t)) {
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem);
|
||||
} else if (alignment == sizeof(uint64_t)) {
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
|
||||
} else if (alignment == sizeof(uint32_t)) {
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
|
||||
} else if (alignment == sizeof(uint16_t)) {
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
|
||||
} else {
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
|
||||
}
|
||||
const size_t localWorkSize = 256;
|
||||
size_t globalWorkSize =
|
||||
std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size);
|
||||
globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
|
||||
|
||||
Memory& gpuCB = gpu().xferWrite().Acquire(patternSize);
|
||||
void* constBuf = gpuCB.map(&gpu(), Resource::NoWait);
|
||||
// If pattern has been expanded, use the expanded pattern, otherwise use the default pattern
|
||||
if (packed_obj.pattern_expanded_) {
|
||||
memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size);
|
||||
} else {
|
||||
memcpy(constBuf, pattern, kpattern_size);
|
||||
}
|
||||
gpuCB.unmap(&gpu());
|
||||
Memory* pGpuCB = &gpuCB;
|
||||
setArgument(kernels_[kFillType], 5, sizeof(cl_mem), &pGpuCB);
|
||||
uint64_t offset = origin[0];
|
||||
|
||||
// Adjust the pattern size in the copy type size
|
||||
kpattern_size /= alignment;
|
||||
setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size);
|
||||
koffset /= alignment;
|
||||
setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset);
|
||||
// Calculate max id
|
||||
kfill_size = memory.virtualAddress() + (koffset + kfill_size * kpattern_size) * alignment;
|
||||
setArgument(kernels_[kFillType], 8, sizeof(kfill_size), &kfill_size);
|
||||
uint32_t next_chunk = globalWorkSize * kpattern_size;
|
||||
setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk);
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
|
||||
|
||||
// Execute the blit
|
||||
address parameters = kernels_[kFillType]->parameters().values();
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[kFillType], parameters);
|
||||
gpu().xferWrite().Release(gpuCB);
|
||||
}
|
||||
Memory& gpuCB = gpu().xferWrite().Acquire(patternSize);
|
||||
void* constBuf = gpuCB.map(&gpu(), Resource::NoWait);
|
||||
memcpy(constBuf, pattern, patternSize);
|
||||
gpuCB.unmap(&gpu());
|
||||
Memory* pGpuCB = &gpuCB;
|
||||
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &pGpuCB);
|
||||
uint64_t offset = origin[0];
|
||||
|
||||
patternSize/= alignment;
|
||||
offset /= alignment;
|
||||
|
||||
setArgument(kernels_[fillType], 5, sizeof(uint32_t), &patternSize);
|
||||
setArgument(kernels_[fillType], 6, sizeof(offset), &offset);
|
||||
setArgument(kernels_[fillType], 7, sizeof(fillSize), &fillSize);
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
|
||||
|
||||
// Execute the blit
|
||||
address parameters = kernels_[fillType]->parameters().values();
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);
|
||||
gpu().xferWrite().Release(gpuCB);
|
||||
}
|
||||
|
||||
synchronize();
|
||||
|
||||
@@ -148,6 +148,8 @@ Settings::Settings() {
|
||||
prepinnedMinSize_ = 0;
|
||||
cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki;
|
||||
useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? false : HIP_FORCE_DEV_KERNARG;
|
||||
|
||||
limit_blit_wg_ = 16;
|
||||
}
|
||||
|
||||
bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
@@ -288,7 +290,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
|
||||
// Cap at OpenCL20 for now
|
||||
if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;
|
||||
|
||||
|
||||
use64BitPtr_ = LP64_SWITCH(false, true);
|
||||
|
||||
if (oclVersion_ >= OpenCL20) {
|
||||
@@ -435,6 +437,10 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
prepinnedMinSize_ = PAL_PREPINNED_MEMORY_SIZE * Ki;
|
||||
}
|
||||
|
||||
limit_blit_wg_ = enableWgpMode_
|
||||
? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
|
||||
: palProp.gfxipProperties.shaderCore.numAvailableCus;
|
||||
|
||||
// Override current device settings
|
||||
override();
|
||||
|
||||
@@ -501,6 +507,10 @@ void Settings::override() {
|
||||
if (!flagIsDefault(PAL_ALWAYS_RESIDENT)) {
|
||||
alwaysResident_ = PAL_ALWAYS_RESIDENT;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(DEBUG_CLR_LIMIT_BLIT_WG)) {
|
||||
limit_blit_wg_ = std::max(DEBUG_CLR_LIMIT_BLIT_WG, 0x1U);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
|
||||
@@ -118,6 +118,7 @@ class Settings : public device::Settings {
|
||||
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
|
||||
|
||||
size_t prepinnedMinSize_; //!< minimal memory size for prepinned transfer
|
||||
uint32_t limit_blit_wg_; //!< The number of workgroups for blit execution
|
||||
|
||||
//! Default constructor
|
||||
Settings();
|
||||
|
||||
@@ -778,6 +778,9 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
|
||||
// Add mask array for AmdAqlWrap slots
|
||||
allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;
|
||||
|
||||
// Align size to 64 bytes for more efficient fill operation
|
||||
allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));
|
||||
|
||||
virtualQueue_ = new Memory(dev(), allocSize);
|
||||
Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote;
|
||||
if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
|
||||
|
||||
@@ -2048,9 +2048,10 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern, size_t patternSize,
|
||||
const amd::Coord3D& surface, const amd::Coord3D& origin,
|
||||
const amd::Coord3D& size, bool entire, bool forceBlit) const {
|
||||
bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern,
|
||||
size_t patternSize, const amd::Coord3D& surface,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire, bool forceBlit) const {
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
|
||||
@@ -2063,79 +2064,90 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
|
||||
synchronize();
|
||||
return result;
|
||||
} else {
|
||||
|
||||
// Pack the fill buffer info, that handles unaligned memories.
|
||||
std::vector<FillBufferInfo> packed_vector{};
|
||||
FillBufferInfo::PackInfo(memory, size[0], origin[0], pattern, patternSize, packed_vector);
|
||||
|
||||
size_t overall_offset = origin[0];
|
||||
for (auto& packed_obj: packed_vector) {
|
||||
uint fillType = FillBufferAligned;
|
||||
|
||||
uint32_t kpattern_size32 = (packed_obj.pattern_expanded_) ? sizeof(size_t) : patternSize;
|
||||
size_t kfill_size = packed_obj.fill_size_/kpattern_size32;
|
||||
constexpr uint32_t kFillType = FillBufferAligned;
|
||||
uint32_t kpattern_size = (packed_obj.pattern_expanded_) ?
|
||||
HostBlitManager::FillBufferInfo::kExtendedSize : patternSize;
|
||||
size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
|
||||
size_t koffset = overall_offset;
|
||||
overall_offset += packed_obj.fill_size_;
|
||||
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize = amd::alignUp(kfill_size, 256);
|
||||
size_t localWorkSize = 256;
|
||||
|
||||
uint32_t alignment = (kpattern_size32 & 0x7) == 0 ?
|
||||
sizeof(uint64_t) :
|
||||
(kpattern_size32 & 0x3) == 0 ?
|
||||
sizeof(uint32_t) :
|
||||
(kpattern_size32 & 0x1) == 0 ?
|
||||
sizeof(uint16_t) : sizeof(uint8_t);
|
||||
|
||||
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) :
|
||||
(kpattern_size & 0x7) == 0 ? sizeof(uint64_t) :
|
||||
(kpattern_size & 0x3) == 0 ? sizeof(uint32_t) :
|
||||
(kpattern_size & 0x1) == 0 ? sizeof(uint16_t) : sizeof(uint8_t);
|
||||
// Program kernels arguments for the fill operation
|
||||
cl_mem mem = as_cl<amd::Memory>(memory.owner());
|
||||
if (alignment == sizeof(uint64_t)) {
|
||||
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
|
||||
if (alignment == 2 * sizeof(uint64_t)) {
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem);
|
||||
} else if (alignment == sizeof(uint64_t)) {
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
|
||||
} else if (alignment == sizeof(uint32_t)) {
|
||||
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
|
||||
} else if (alignment == sizeof(uint16_t)) {
|
||||
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
|
||||
} else {
|
||||
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
|
||||
}
|
||||
const size_t localWorkSize = 256;
|
||||
size_t globalWorkSize =
|
||||
std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size);
|
||||
globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
|
||||
|
||||
auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
|
||||
|
||||
// If pattern has been expanded, use the expanded pattern, otherwise use the default pattern.
|
||||
if (packed_obj.pattern_expanded_) {
|
||||
memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size32);
|
||||
memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size);
|
||||
} else {
|
||||
memcpy(constBuf, pattern, kpattern_size32);
|
||||
memcpy(constBuf, pattern, kpattern_size);
|
||||
}
|
||||
constexpr bool kDirectVa = true;
|
||||
setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
|
||||
setArgument(kernels_[kFillType], 5, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
|
||||
|
||||
// Adjust the pattern size in the copy type size
|
||||
kpattern_size /= alignment;
|
||||
setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size);
|
||||
koffset /= alignment;
|
||||
kpattern_size32 /= alignment;
|
||||
|
||||
setArgument(kernels_[fillType], 5, sizeof(uint32_t), &kpattern_size32);
|
||||
setArgument(kernels_[fillType], 6, sizeof(koffset), &koffset);
|
||||
setArgument(kernels_[fillType], 7, sizeof(kfill_size), &kfill_size);
|
||||
setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset);
|
||||
// Calculate max id
|
||||
kfill_size = memory.virtualAddress() + (koffset + kfill_size * kpattern_size) * alignment;
|
||||
setArgument(kernels_[kFillType], 8, sizeof(kfill_size), &kfill_size);
|
||||
uint32_t next_chunk = globalWorkSize * kpattern_size;
|
||||
setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk);
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
|
||||
|
||||
// Execute the blit
|
||||
address parameters = captureArguments(kernels_[fillType]);
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, nullptr);
|
||||
address parameters = captureArguments(kernels_[kFillType]);
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[kFillType], parameters, nullptr);
|
||||
releaseArguments(parameters);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -769,7 +769,10 @@ bool Device::create() {
|
||||
pciDeviceId_);
|
||||
return false;
|
||||
}
|
||||
|
||||
hsaSettings->limit_blit_wg_ = info().maxComputeUnits_;
|
||||
if (!flagIsDefault(DEBUG_CLR_LIMIT_BLIT_WG)) {
|
||||
hsaSettings->limit_blit_wg_ = std::max(DEBUG_CLR_LIMIT_BLIT_WG, 0x1U);
|
||||
}
|
||||
amd::Context::Info info = {0};
|
||||
std::vector<amd::Device*> devices;
|
||||
devices.push_back(this);
|
||||
|
||||
@@ -96,6 +96,7 @@ Settings::Settings() {
|
||||
|
||||
host_hdp_flush_ = true;
|
||||
gwsInitSupported_ = true;
|
||||
limit_blit_wg_ = 16;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
|
||||
@@ -76,6 +76,7 @@ class Settings : public device::Settings {
|
||||
size_t sdmaCopyThreshold_; //!< Use SDMA to copy above this size
|
||||
|
||||
uint32_t hmmFlags_; //!< HMM functionality control flags
|
||||
uint32_t limit_blit_wg_; //!< The number of workgroups for blit execution
|
||||
|
||||
//! Default constructor
|
||||
Settings();
|
||||
|
||||
@@ -2828,8 +2828,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
// Add mask array for AmdAqlWrap slots
|
||||
allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;
|
||||
|
||||
// Make sure the allocation size aligns with DWORD.
|
||||
allocSize = amd::alignUp(allocSize, sizeof(uint64_t));
|
||||
// Align size to 64 bytes for more efficient fill operation
|
||||
allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));
|
||||
|
||||
// CL_MEM_ALLOC_HOST_PTR/CL_MEM_READ_WRITE
|
||||
virtualQueue_ = new (dev().context()) amd::Buffer(dev().context(), CL_MEM_READ_WRITE, allocSize);
|
||||
|
||||
@@ -221,6 +221,8 @@ release(uint, ROC_AQL_QUEUE_SIZE, 16384, \
|
||||
"AQL queue size in AQL packets") \
|
||||
release(uint, ROC_SIGNAL_POOL_SIZE, 32, \
|
||||
"Initial size of HSA signal pool") \
|
||||
release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \
|
||||
"Limit the number of workgroups in blit operations") \
|
||||
release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \
|
||||
"If true, then runtime can skip kernel arg copy") \
|
||||
release(bool, GPU_STREAMOPS_CP_WAIT, false, \
|
||||
|
||||
Criar uma nova questão referindo esta
Bloquear um utilizador