2
0

SWDEV-432174 - Change the fillBuffer kernel

- Add the new fillBuffer kernel, which allows to launch a limited
number of workgroups for memory fill operation
- Switch fill memory to 16 bytes write by default
- Allow to limit the workgroups with DEBUG_CLR_LIMIT_BLIT_WG

Change-Id: Ibad1822f2d42b2fc71bcfc1917c31409c0623e8e
Este cometimento está contido em:
German Andryeyev
2023-11-14 12:49:17 -05:00
ascendente 57cb840058
cometimento f1dc81f427
13 ficheiros modificados com 283 adições e 214 eliminações
+41 -67
Ver ficheiro
@@ -344,7 +344,7 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory&
bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin,
const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
bool entire, size_t rowPitch, size_t slicePitch,
bool entire, size_t rowPitch, size_t slicePitch,
amd::CopyMetadata copyMetadata) const {
size_t startLayer = srcOrigin[2];
size_t numLayers = size[2];
@@ -475,7 +475,7 @@ bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memor
bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire,
const amd::Coord3D& size, bool entire,
amd::CopyMetadata copyMetadata) const {
size_t startLayer = srcOrigin[2];
size_t numLayers = size[2];
@@ -685,49 +685,41 @@ uint32_t HostBlitManager::sRGBmap(float fc) const {
return (uint32_t)(c * 255.0 + 0.5);
}
bool HostBlitManager::FillBufferInfo::ExpandPattern64(uint64_t pattern, size_t pattern_size,
uint64_t& pattern64) {
// ================================================================================================
void HostBlitManager::FillBufferInfo::ExpandPattern(uint32_t pattern_size, const void* pattern) {
// If pattern size exceeds extended, then runtime will select the normal path
if (pattern_size >= kExtendedSize) {
return;
}
bool retval = true;
do {
// If the pattern is 0 or if the pattern_size is same as max size.
if (pattern == 0 || pattern_size == sizeof(uint64_t)) {
pattern64 = pattern;
break;
pattern_expanded_ = true;
if (pattern_size == sizeof(uint8_t)) {
uint8_t pattern_byte = *reinterpret_cast<const uint8_t*>(pattern);
for (uint32_t i = 0; i < kExtendedSize; ++i) {
reinterpret_cast<uint8_t*>(expanded_pattern_)[i] = pattern_byte;
}
// Clean Curr_pattern, since it was casted off from const void* with a lesser size than size_t.
ClearBits64(pattern, (pattern_size * 8));
pattern64 = 0;
if (pattern_size == sizeof(uint8_t)) {
pattern = pattern & 0xff;
pattern64 = ((pattern << 56) | (pattern << 48) | (pattern << 40) | (pattern << 32)
| (pattern << 24) | (pattern << 16) | (pattern << 8) | (pattern));
} else if (pattern_size == sizeof(uint16_t)) {
pattern = pattern & 0xffff;
pattern64 = ((pattern << 48) | (pattern << 32) | (pattern << 16) | (pattern));
} else if (pattern_size == sizeof(uint32_t)) {
pattern = pattern & 0xffffffff;
pattern64 = ((pattern << 32) | (pattern));
} else {
LogPrintfError("Unsupported Pattern size: %u \n", pattern_size);
retval = false;
break;
} else if (pattern_size == sizeof(uint16_t)) {
uint16_t pattern_word = *reinterpret_cast<const uint16_t*>(pattern);
for (uint32_t i = 0; i < kExtendedSize / sizeof(uint16_t); ++i) {
reinterpret_cast<uint16_t*>(expanded_pattern_)[i] = pattern_word;
}
} while (0);
return retval;
} else if (pattern_size == sizeof(uint32_t)) {
uint32_t pattern_dword = *reinterpret_cast<const uint32_t*>(pattern);
for (uint32_t i = 0; i < kExtendedSize / sizeof(uint32_t); ++i) {
reinterpret_cast<uint32_t*>(expanded_pattern_)[i] = pattern_dword;
}
} else {
uint64_t pattern_qword = *reinterpret_cast<const uint64_t*>(pattern);
reinterpret_cast<uint64_t*>(expanded_pattern_)[0] = pattern_qword;
reinterpret_cast<uint64_t*>(expanded_pattern_)[1] = pattern_qword;
}
}
bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, size_t fill_size,
size_t fill_origin, const void* pattern_ptr,
size_t pattern_size,
std::vector<FillBufferInfo>& packed_info) {
// ================================================================================================
void HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, size_t fill_size,
size_t fill_origin, const void* pattern_ptr,
size_t pattern_size,
std::vector<FillBufferInfo>& packed_info) {
// 1. Validate input arguments
guarantee(fill_size >= pattern_size, "Pattern Size: %u cannot be greater than fill size: %u \n",
pattern_size, fill_size);
@@ -736,60 +728,42 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz
// 2. Calculate the next closest dword aligned address for faster processing
size_t dst_addr = memory.virtualAddress() + fill_origin;
size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t));
size_t aligned_dst_addr = amd::alignUp(dst_addr, kExtendedSize);
guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %u cannot be greater than destination"
"address :%u \n", aligned_dst_addr, dst_addr);
// 3. If given address is not aligned calculate head and tail size.
size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size);
size_t aligned_size = ((fill_size - head_size) / sizeof(size_t)) * sizeof(size_t);
size_t tail_size = (fill_size - head_size) % sizeof(size_t);
size_t aligned_size = ((fill_size - head_size) / kExtendedSize) * kExtendedSize;
size_t tail_size = (fill_size - head_size) % kExtendedSize;
guarantee((head_size + aligned_size + tail_size) <= fill_size, "Head size, aligned size & tail"
"size together cannot cross fill size");
// 4. Clear unwanted bytes from the pattern if the pattern size is < sizeof(size_t).
uint64_t pattern = *(reinterpret_cast<uint64_t*>(const_cast<void*>(pattern_ptr)));
if (pattern_size < sizeof(uint64_t)) {
ClearBits64(pattern, (pattern_size * 8));
}
// 5. Fill the head, aligned, tail info if they exist.
FillBufferInfo fill_info;
// 4. Fill the head, aligned, tail info if they exist.
if (head_size > 0) {
// Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern.
guarantee((head_size % pattern_size) == 0, "Offseted ptr should align with pattern_size");
fill_info.fill_size_ = head_size;
FillBufferInfo fill_info(head_size);
packed_info.push_back(fill_info);
}
fill_info.clearInfo();
if (aligned_size > 0) {
// Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern.
guarantee((aligned_size % pattern_size) == 0, "Offseted ptr should align with pattern_size");
if (pattern_size < sizeof(uint64_t)) {
if (!ExpandPattern64(pattern, pattern_size, fill_info.expanded_pattern_)) {
DevLogPrintfError("Failed Expanding the pattern for pattern:%u, pattern_size: %u",
pattern, pattern_size);
return false;
}
fill_info.pattern_expanded_ = true;
}
fill_info.fill_size_ = aligned_size;
FillBufferInfo fill_info(aligned_size);
fill_info.ExpandPattern(pattern_size, pattern_ptr);
packed_info.push_back(fill_info);
}
fill_info.clearInfo();
if (tail_size > 0) {
// Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern.
guarantee((tail_size % pattern_size) == 0, "Offseted ptr should align with pattern_size");
fill_info.fill_size_ = tail_size;
FillBufferInfo fill_info(tail_size);
packed_info.push_back(fill_info);
}
fill_info.clearInfo();
return true;
}
} // namespace gpu
+21 -33
Ver ficheiro
@@ -78,7 +78,7 @@ class BlitManager : public amd::HeapObject {
const amd::Coord3D& origin, //!< Source origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata()//!< Memory copy MetaData
) const = 0;
@@ -89,7 +89,7 @@ class BlitManager : public amd::HeapObject {
const amd::BufferRect& hostRect, //!< Destination rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata() //!< Memory copy MetaData
) const = 0;
@@ -101,7 +101,7 @@ class BlitManager : public amd::HeapObject {
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata()//!< Memory copy MetaData
) const = 0;
@@ -111,7 +111,7 @@ class BlitManager : public amd::HeapObject {
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata() //!< Memory copy MetaData
) const = 0;
@@ -122,7 +122,7 @@ class BlitManager : public amd::HeapObject {
const amd::BufferRect& bufRect, //!< Source rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata() //!< Memory copy MetaData
) const = 0;
@@ -134,7 +134,7 @@ class BlitManager : public amd::HeapObject {
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata() //!< Memory copy MetaData
) const = 0;
@@ -193,7 +193,7 @@ class BlitManager : public amd::HeapObject {
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata() //!< Memory copy MetaData
) const = 0;
@@ -301,7 +301,7 @@ class HostBlitManager : public device::BlitManager {
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata() //!< Memory copy MetaData
) const;
@@ -334,7 +334,7 @@ class HostBlitManager : public device::BlitManager {
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata() //!< Memory copy MetaData
) const;
@@ -393,7 +393,7 @@ class HostBlitManager : public device::BlitManager {
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
amd::CopyMetadata copyMetadata =
amd::CopyMetadata copyMetadata =
amd::CopyMetadata() //!< Memory copy MetaData
) const;
@@ -423,36 +423,24 @@ class HostBlitManager : public device::BlitManager {
const amd::Device& dev_; //!< Physical device
// Packed Fill Buffer
class FillBufferInfo {
public:
FillBufferInfo(): fill_size_(0), expanded_pattern_(0), pattern_expanded_(false) {}
struct FillBufferInfo {
static constexpr uint32_t kExtendedSize = 2 * sizeof(uint64_t);
static bool PackInfo(const device::Memory& memory, size_t fill_size,
size_t fill_origin, const void* pattern, size_t pattern_size,
std::vector<FillBufferInfo>& packed_info);
private:
static bool ExpandPattern64(uint64_t pattern, size_t pattern_size, uint64_t& pattern64);
static inline void ClearBits64(uint64_t& pattern, uint64_t num_bits) {
pattern &= ~(~(static_cast<uint64_t>(0)) << num_bits);
static void PackInfo(const device::Memory& memory, size_t fill_size,
size_t fill_origin, const void* pattern, size_t pattern_size,
std::vector<FillBufferInfo>& packed_info);
FillBufferInfo(size_t fill_size): fill_size_(fill_size), pattern_expanded_(false) {
memset(&expanded_pattern_, 0, sizeof(expanded_pattern_));
}
void clearInfo () {
fill_size_ = 0;
expanded_pattern_ = 0;
pattern_expanded_ = false;
}
void ExpandPattern(uint32_t pattern_size, const void* pattern);
public:
size_t fill_size_; // Fill size for this command
uint64_t expanded_pattern_; // Pattern for this command
bool pattern_expanded_; // Boolean to check if pattern is expanded
size_t fill_size_; //!< Fill size for this command
uint8_t expanded_pattern_[kExtendedSize]; //!< Pattern for this command - 16 bytes
bool pattern_expanded_; //!< Boolean to check if pattern is expanded
};
private:
//! Disable copy constructor
HostBlitManager(const HostBlitManager&);
+51 -10
Ver ficheiro
@@ -47,16 +47,57 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
extern void __ockl_gws_init(uint nwm1, uint rid);
// Implementation
__kernel void __amd_rocclr_fillBufferAligned(__global uchar* bufUChar,
__global ushort* bufUShort,
__global uint* bufUInt,
__global ulong* bufULong,
__constant uchar* pattern,
uint patternSize, ulong offset,
ulong size) {
__amd_fillBufferAligned(bufUChar, bufUShort, bufUInt, bufULong,
pattern, patternSize, offset, size);
__kernel void __amd_rocclr_fillBufferAligned(
__global uchar* bufUChar, __global ushort* bufUShort, __global uint* bufUInt,
__global ulong* bufULong, __global ulong2* bufULong2, __constant uchar* pattern,
uint pattern_size, ulong offset, ulong end_ptr, uint next_chunk) {
int id = get_global_id(0);
long cur_id = offset + id * pattern_size;
if (bufULong2) {
__global ulong2* element = &bufULong2[cur_id];
__constant ulong2* pt = (__constant ulong2*)pattern;
while ((ulong)element < end_ptr) {
for (uint i = 0; i < pattern_size; ++i) {
element[i] = pt[i];
}
element += next_chunk;
}
} else if (bufULong) {
__global ulong* element = &bufULong[cur_id];
__constant ulong* pt = (__constant ulong*)pattern;
while ((ulong)element < end_ptr) {
for (uint i = 0; i < pattern_size; ++i) {
element[i] = pt[i];
}
element += next_chunk;
}
} else if (bufUInt) {
__global uint* element = &bufUInt[cur_id];
__constant uint* pt = (__constant uint*)pattern;
while ((ulong)element < end_ptr) {
for (uint i = 0; i < pattern_size; ++i) {
element[i] = pt[i];
}
element += next_chunk;
}
} else if (bufUShort) {
__global ushort* element = &bufUShort[cur_id];
__constant ushort* pt = (__constant ushort*)pattern;
while ((ulong)element < end_ptr) {
for (uint i = 0; i < pattern_size; ++i) {
element[i] = pt[i];
}
element += next_chunk;
}
} else {
__global uchar* element = &bufUChar[cur_id];
while ((ulong)element < end_ptr) {
for (uint i = 0; i < pattern_size; ++i) {
element[i] = pattern[i];
}
element += next_chunk;
}
}
}
__kernel void __amd_rocclr_fillBufferAligned2D(__global uchar* bufUChar,
+88 -55
Ver ficheiro
@@ -2172,63 +2172,96 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
synchronize();
return result;
} else {
uint fillType = FillBufferAligned;
size_t globalWorkOffset[3] = {0, 0, 0};
uint64_t fillSize = size[0] / patternSize;
size_t globalWorkSize = amd::alignUp(fillSize, 256);
size_t localWorkSize = 256;
uint32_t alignment = (patternSize & 0x7) == 0 ?
sizeof(uint64_t) :
(patternSize & 0x3) == 0 ?
sizeof(uint32_t) :
(patternSize & 0x1) == 0 ?
sizeof(uint16_t) : sizeof(uint8_t);
// Pack the fill buffer info, that handles unaligned memories.
std::vector<FillBufferInfo> packed_vector{};
FillBufferInfo::PackInfo(memory, size[0], origin[0], pattern, patternSize, packed_vector);
// Program kernels arguments for the fill operation
Memory* mem = &gpuMem(memory);
if (alignment == sizeof(uint64_t)) {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
} else if (alignment == sizeof(uint32_t)) {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
} else if (alignment == sizeof(uint16_t)) {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
} else {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
size_t overall_offset = origin[0];
for (auto& packed_obj : packed_vector) {
constexpr uint32_t kFillType = FillBufferAligned;
uint32_t kpattern_size = (packed_obj.pattern_expanded_)
? HostBlitManager::FillBufferInfo::kExtendedSize
: patternSize;
size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
size_t koffset = overall_offset;
overall_offset += packed_obj.fill_size_;
size_t globalWorkOffset[3] = {0, 0, 0};
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) :
(kpattern_size & 0x7) == 0 ? sizeof(uint64_t) :
(kpattern_size & 0x3) == 0 ? sizeof(uint32_t) :
(kpattern_size & 0x1) == 0 ? sizeof(uint16_t) : sizeof(uint8_t);
// Program kernels arguments for the fill operation
Memory* mem = &gpuMem(memory);
if (alignment == 2 * sizeof(uint64_t)) {
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem);
} else if (alignment == sizeof(uint64_t)) {
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
} else if (alignment == sizeof(uint32_t)) {
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
} else if (alignment == sizeof(uint16_t)) {
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
} else {
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
}
const size_t localWorkSize = 256;
size_t globalWorkSize =
std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size);
globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
Memory& gpuCB = gpu().xferWrite().Acquire(patternSize);
void* constBuf = gpuCB.map(&gpu(), Resource::NoWait);
// If pattern has been expanded, use the expanded pattern, otherwise use the default pattern
if (packed_obj.pattern_expanded_) {
memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size);
} else {
memcpy(constBuf, pattern, kpattern_size);
}
gpuCB.unmap(&gpu());
Memory* pGpuCB = &gpuCB;
setArgument(kernels_[kFillType], 5, sizeof(cl_mem), &pGpuCB);
uint64_t offset = origin[0];
// Adjust the pattern size in the copy type size
kpattern_size /= alignment;
setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size);
koffset /= alignment;
setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset);
// Calculate max id
kfill_size = memory.virtualAddress() + (koffset + kfill_size * kpattern_size) * alignment;
setArgument(kernels_[kFillType], 8, sizeof(kfill_size), &kfill_size);
uint32_t next_chunk = globalWorkSize * kpattern_size;
setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
// Execute the blit
address parameters = kernels_[kFillType]->parameters().values();
result = gpu().submitKernelInternal(ndrange, *kernels_[kFillType], parameters);
gpu().xferWrite().Release(gpuCB);
}
Memory& gpuCB = gpu().xferWrite().Acquire(patternSize);
void* constBuf = gpuCB.map(&gpu(), Resource::NoWait);
memcpy(constBuf, pattern, patternSize);
gpuCB.unmap(&gpu());
Memory* pGpuCB = &gpuCB;
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &pGpuCB);
uint64_t offset = origin[0];
patternSize/= alignment;
offset /= alignment;
setArgument(kernels_[fillType], 5, sizeof(uint32_t), &patternSize);
setArgument(kernels_[fillType], 6, sizeof(offset), &offset);
setArgument(kernels_[fillType], 7, sizeof(fillSize), &fillSize);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
// Execute the blit
address parameters = kernels_[fillType]->parameters().values();
result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);
gpu().xferWrite().Release(gpuCB);
}
synchronize();
+11 -1
Ver ficheiro
@@ -148,6 +148,8 @@ Settings::Settings() {
prepinnedMinSize_ = 0;
cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki;
useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? false : HIP_FORCE_DEV_KERNARG;
limit_blit_wg_ = 16;
}
bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -288,7 +290,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
// Cap at OpenCL20 for now
if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;
use64BitPtr_ = LP64_SWITCH(false, true);
if (oclVersion_ >= OpenCL20) {
@@ -435,6 +437,10 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
prepinnedMinSize_ = PAL_PREPINNED_MEMORY_SIZE * Ki;
}
limit_blit_wg_ = enableWgpMode_
? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
: palProp.gfxipProperties.shaderCore.numAvailableCus;
// Override current device settings
override();
@@ -501,6 +507,10 @@ void Settings::override() {
if (!flagIsDefault(PAL_ALWAYS_RESIDENT)) {
alwaysResident_ = PAL_ALWAYS_RESIDENT;
}
if (!flagIsDefault(DEBUG_CLR_LIMIT_BLIT_WG)) {
limit_blit_wg_ = std::max(DEBUG_CLR_LIMIT_BLIT_WG, 0x1U);
}
}
} // namespace pal
+1
Ver ficheiro
@@ -118,6 +118,7 @@ class Settings : public device::Settings {
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
size_t prepinnedMinSize_; //!< minimal memory size for prepinned transfer
uint32_t limit_blit_wg_; //!< The number of workgroups for blit execution
//! Default constructor
Settings();
+3
Ver ficheiro
@@ -778,6 +778,9 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
// Add mask array for AmdAqlWrap slots
allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;
// Align size to 64 bytes for more efficient fill operation
allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));
virtualQueue_ = new Memory(dev(), allocSize);
Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote;
if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
+57 -45
Ver ficheiro
@@ -2048,9 +2048,10 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
}
// ================================================================================================
bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern, size_t patternSize,
const amd::Coord3D& surface, const amd::Coord3D& origin,
const amd::Coord3D& size, bool entire, bool forceBlit) const {
bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern,
size_t patternSize, const amd::Coord3D& surface,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire, bool forceBlit) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
@@ -2063,79 +2064,90 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
synchronize();
return result;
} else {
// Pack the fill buffer info, that handles unaligned memories.
std::vector<FillBufferInfo> packed_vector{};
FillBufferInfo::PackInfo(memory, size[0], origin[0], pattern, patternSize, packed_vector);
size_t overall_offset = origin[0];
for (auto& packed_obj: packed_vector) {
uint fillType = FillBufferAligned;
uint32_t kpattern_size32 = (packed_obj.pattern_expanded_) ? sizeof(size_t) : patternSize;
size_t kfill_size = packed_obj.fill_size_/kpattern_size32;
constexpr uint32_t kFillType = FillBufferAligned;
uint32_t kpattern_size = (packed_obj.pattern_expanded_) ?
HostBlitManager::FillBufferInfo::kExtendedSize : patternSize;
size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
size_t koffset = overall_offset;
overall_offset += packed_obj.fill_size_;
size_t globalWorkOffset[3] = {0, 0, 0};
size_t globalWorkSize = amd::alignUp(kfill_size, 256);
size_t localWorkSize = 256;
uint32_t alignment = (kpattern_size32 & 0x7) == 0 ?
sizeof(uint64_t) :
(kpattern_size32 & 0x3) == 0 ?
sizeof(uint32_t) :
(kpattern_size32 & 0x1) == 0 ?
sizeof(uint16_t) : sizeof(uint8_t);
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) :
(kpattern_size & 0x7) == 0 ? sizeof(uint64_t) :
(kpattern_size & 0x3) == 0 ? sizeof(uint32_t) :
(kpattern_size & 0x1) == 0 ? sizeof(uint16_t) : sizeof(uint8_t);
// Program kernels arguments for the fill operation
cl_mem mem = as_cl<amd::Memory>(memory.owner());
if (alignment == sizeof(uint64_t)) {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
if (alignment == 2 * sizeof(uint64_t)) {
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem);
} else if (alignment == sizeof(uint64_t)) {
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
} else if (alignment == sizeof(uint32_t)) {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
} else if (alignment == sizeof(uint16_t)) {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
} else {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem);
setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
}
const size_t localWorkSize = 256;
size_t globalWorkSize =
std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size);
globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
// If pattern has been expanded, use the expanded pattern, otherwise use the default pattern.
if (packed_obj.pattern_expanded_) {
memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size32);
memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size);
} else {
memcpy(constBuf, pattern, kpattern_size32);
memcpy(constBuf, pattern, kpattern_size);
}
constexpr bool kDirectVa = true;
setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
setArgument(kernels_[kFillType], 5, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
// Adjust the pattern size in the copy type size
kpattern_size /= alignment;
setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size);
koffset /= alignment;
kpattern_size32 /= alignment;
setArgument(kernels_[fillType], 5, sizeof(uint32_t), &kpattern_size32);
setArgument(kernels_[fillType], 6, sizeof(koffset), &koffset);
setArgument(kernels_[fillType], 7, sizeof(kfill_size), &kfill_size);
setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset);
// Calculate max id
kfill_size = memory.virtualAddress() + (koffset + kfill_size * kpattern_size) * alignment;
setArgument(kernels_[kFillType], 8, sizeof(kfill_size), &kfill_size);
uint32_t next_chunk = globalWorkSize * kpattern_size;
setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
// Execute the blit
address parameters = captureArguments(kernels_[fillType]);
result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, nullptr);
address parameters = captureArguments(kernels_[kFillType]);
result = gpu().submitKernelInternal(ndrange, *kernels_[kFillType], parameters, nullptr);
releaseArguments(parameters);
}
}
+4 -1
Ver ficheiro
@@ -769,7 +769,10 @@ bool Device::create() {
pciDeviceId_);
return false;
}
hsaSettings->limit_blit_wg_ = info().maxComputeUnits_;
if (!flagIsDefault(DEBUG_CLR_LIMIT_BLIT_WG)) {
hsaSettings->limit_blit_wg_ = std::max(DEBUG_CLR_LIMIT_BLIT_WG, 0x1U);
}
amd::Context::Info info = {0};
std::vector<amd::Device*> devices;
devices.push_back(this);
+1
Ver ficheiro
@@ -96,6 +96,7 @@ Settings::Settings() {
host_hdp_flush_ = true;
gwsInitSupported_ = true;
limit_blit_wg_ = 16;
}
// ================================================================================================
+1
Ver ficheiro
@@ -76,6 +76,7 @@ class Settings : public device::Settings {
size_t sdmaCopyThreshold_; //!< Use SDMA to copy above this size
uint32_t hmmFlags_; //!< HMM functionality control flags
uint32_t limit_blit_wg_; //!< The number of workgroups for blit execution
//! Default constructor
Settings();
+2 -2
Ver ficheiro
@@ -2828,8 +2828,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
// Add mask array for AmdAqlWrap slots
allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;
// Make sure the allocation size aligns with DWORD.
allocSize = amd::alignUp(allocSize, sizeof(uint64_t));
// Align size to 64 bytes for more efficient fill operation
allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));
// CL_MEM_ALLOC_HOST_PTR/CL_MEM_READ_WRITE
virtualQueue_ = new (dev().context()) amd::Buffer(dev().context(), CL_MEM_READ_WRITE, allocSize);
+2
Ver ficheiro
@@ -221,6 +221,8 @@ release(uint, ROC_AQL_QUEUE_SIZE, 16384, \
"AQL queue size in AQL packets") \
release(uint, ROC_SIGNAL_POOL_SIZE, 32, \
"Initial size of HSA signal pool") \
release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \
"Limit the number of workgroups in blit operations") \
release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \
"If true, then runtime can skip kernel arg copy") \
release(bool, GPU_STREAMOPS_CP_WAIT, false, \