diff --git a/projects/clr/rocclr/device/blit.cpp b/projects/clr/rocclr/device/blit.cpp index a2c25376b4..33b04c5987 100644 --- a/projects/clr/rocclr/device/blit.cpp +++ b/projects/clr/rocclr/device/blit.cpp @@ -344,7 +344,7 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, - bool entire, size_t rowPitch, size_t slicePitch, + bool entire, size_t rowPitch, size_t slicePitch, amd::CopyMetadata copyMetadata) const { size_t startLayer = srcOrigin[2]; size_t numLayers = size[2]; @@ -475,7 +475,7 @@ bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memor bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, + const amd::Coord3D& size, bool entire, amd::CopyMetadata copyMetadata) const { size_t startLayer = srcOrigin[2]; size_t numLayers = size[2]; @@ -685,49 +685,41 @@ uint32_t HostBlitManager::sRGBmap(float fc) const { return (uint32_t)(c * 255.0 + 0.5); } -bool HostBlitManager::FillBufferInfo::ExpandPattern64(uint64_t pattern, size_t pattern_size, - uint64_t& pattern64) { +// ================================================================================================ +void HostBlitManager::FillBufferInfo::ExpandPattern(uint32_t pattern_size, const void* pattern) { + // If pattern size exceeds extended, then runtime will select the normal path + if (pattern_size >= kExtendedSize) { + return; + } - bool retval = true; - - do { - - // If the pattern is 0 or if the pattern_size is same as max size. - if (pattern == 0 || pattern_size == sizeof(uint64_t)) { - pattern64 = pattern; - break; + pattern_expanded_ = true; + if (pattern_size == sizeof(uint8_t)) { + uint8_t pattern_byte = *reinterpret_cast(pattern); + for (uint32_t i = 0; i < kExtendedSize; ++i) { + reinterpret_cast(expanded_pattern_)[i] = pattern_byte; } - - // Clean Curr_pattern, since it was casted off from const void* with a lesser size than size_t. - ClearBits64(pattern, (pattern_size * 8)); - pattern64 = 0; - - if (pattern_size == sizeof(uint8_t)) { - pattern = pattern & 0xff; - pattern64 = ((pattern << 56) | (pattern << 48) | (pattern << 40) | (pattern << 32) - | (pattern << 24) | (pattern << 16) | (pattern << 8) | (pattern)); - } else if (pattern_size == sizeof(uint16_t)) { - pattern = pattern & 0xffff; - pattern64 = ((pattern << 48) | (pattern << 32) | (pattern << 16) | (pattern)); - } else if (pattern_size == sizeof(uint32_t)) { - pattern = pattern & 0xffffffff; - pattern64 = ((pattern << 32) | (pattern)); - } else { - LogPrintfError("Unsupported Pattern size: %u \n", pattern_size); - retval = false; - break; + } else if (pattern_size == sizeof(uint16_t)) { + uint16_t pattern_word = *reinterpret_cast(pattern); + for (uint32_t i = 0; i < kExtendedSize / sizeof(uint16_t); ++i) { + reinterpret_cast(expanded_pattern_)[i] = pattern_word; } - - } while (0); - - return retval; + } else if (pattern_size == sizeof(uint32_t)) { + uint32_t pattern_dword = *reinterpret_cast(pattern); + for (uint32_t i = 0; i < kExtendedSize / sizeof(uint32_t); ++i) { + reinterpret_cast(expanded_pattern_)[i] = pattern_dword; + } + } else { + uint64_t pattern_qword = *reinterpret_cast(pattern); + reinterpret_cast(expanded_pattern_)[0] = pattern_qword; + reinterpret_cast(expanded_pattern_)[1] = pattern_qword; + } } -bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, size_t fill_size, - size_t fill_origin, const void* pattern_ptr, - size_t pattern_size, - std::vector& packed_info) { - +// ================================================================================================ +void HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, size_t fill_size, + size_t fill_origin, const void* pattern_ptr, + size_t pattern_size, + std::vector& packed_info) { // 1. Validate input arguments guarantee(fill_size >= pattern_size, "Pattern Size: %u cannot be greater than fill size: %u \n", pattern_size, fill_size); @@ -736,60 +728,42 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz // 2. Calculate the next closest dword aligned address for faster processing size_t dst_addr = memory.virtualAddress() + fill_origin; - size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t)); + size_t aligned_dst_addr = amd::alignUp(dst_addr, kExtendedSize); guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %u cannot be greater than destination" "address :%u \n", aligned_dst_addr, dst_addr); // 3. If given address is not aligned calculate head and tail size. size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size); - size_t aligned_size = ((fill_size - head_size) / sizeof(size_t)) * sizeof(size_t); - size_t tail_size = (fill_size - head_size) % sizeof(size_t); + size_t aligned_size = ((fill_size - head_size) / kExtendedSize) * kExtendedSize; + size_t tail_size = (fill_size - head_size) % kExtendedSize; guarantee((head_size + aligned_size + tail_size) <= fill_size, "Head size, aligned size & tail" "size together cannot cross fill size"); - // 4. Clear unwanted bytes from the pattern if the pattern size is < sizeof(size_t). - uint64_t pattern = *(reinterpret_cast(const_cast(pattern_ptr))); - if (pattern_size < sizeof(uint64_t)) { - ClearBits64(pattern, (pattern_size * 8)); - } - - // 5. Fill the head, aligned, tail info if they exist. - FillBufferInfo fill_info; + // 4. Fill the head, aligned, tail info if they exist. if (head_size > 0) { // Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern. guarantee((head_size % pattern_size) == 0, "Offseted ptr should align with pattern_size"); - fill_info.fill_size_ = head_size; + FillBufferInfo fill_info(head_size); packed_info.push_back(fill_info); } - fill_info.clearInfo(); if (aligned_size > 0) { // Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern. guarantee((aligned_size % pattern_size) == 0, "Offseted ptr should align with pattern_size"); - if (pattern_size < sizeof(uint64_t)) { - if (!ExpandPattern64(pattern, pattern_size, fill_info.expanded_pattern_)) { - DevLogPrintfError("Failed Expanding the pattern for pattern:%u, pattern_size: %u", - pattern, pattern_size); - return false; - } - fill_info.pattern_expanded_ = true; - } - fill_info.fill_size_ = aligned_size; + FillBufferInfo fill_info(aligned_size); + fill_info.ExpandPattern(pattern_size, pattern_ptr); packed_info.push_back(fill_info); } - fill_info.clearInfo(); if (tail_size > 0) { // Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern. guarantee((tail_size % pattern_size) == 0, "Offseted ptr should align with pattern_size"); - fill_info.fill_size_ = tail_size; + FillBufferInfo fill_info(tail_size); packed_info.push_back(fill_info); } - fill_info.clearInfo(); - - return true; } + } // namespace gpu diff --git a/projects/clr/rocclr/device/blit.hpp b/projects/clr/rocclr/device/blit.hpp index 21b7b37c76..e5cf93526a 100644 --- a/projects/clr/rocclr/device/blit.hpp +++ b/projects/clr/rocclr/device/blit.hpp @@ -78,7 +78,7 @@ class BlitManager : public amd::HeapObject { const amd::Coord3D& origin, //!< Source origin const amd::Coord3D& size, //!< Size of the copy region bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata()//!< Memory copy MetaData ) const = 0; @@ -89,7 +89,7 @@ class BlitManager : public amd::HeapObject { const amd::BufferRect& hostRect, //!< Destination rectangle const amd::Coord3D& size, //!< Size of the copy region bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata() //!< Memory copy MetaData ) const = 0; @@ -101,7 +101,7 @@ class BlitManager : public amd::HeapObject { size_t rowPitch, //!< Row pitch for host memory size_t slicePitch, //!< Slice pitch for host memory bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata()//!< Memory copy MetaData ) const = 0; @@ -111,7 +111,7 @@ class BlitManager : public amd::HeapObject { const amd::Coord3D& origin, //!< Destination origin const amd::Coord3D& size, //!< Size of the copy region bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata() //!< Memory copy MetaData ) const = 0; @@ -122,7 +122,7 @@ class BlitManager : public amd::HeapObject { const amd::BufferRect& bufRect, //!< Source rectangle const amd::Coord3D& size, //!< Size of the copy region bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata() //!< Memory copy MetaData ) const = 0; @@ -134,7 +134,7 @@ class BlitManager : public amd::HeapObject { size_t rowPitch, //!< Row pitch for host memory size_t slicePitch, //!< Slice pitch for host memory bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata() //!< Memory copy MetaData ) const = 0; @@ -193,7 +193,7 @@ class BlitManager : public amd::HeapObject { const amd::Coord3D& dstOrigin, //!< Destination origin const amd::Coord3D& size, //!< Size of the copy region bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata() //!< Memory copy MetaData ) const = 0; @@ -301,7 +301,7 @@ class HostBlitManager : public device::BlitManager { size_t rowPitch, //!< Row pitch for host memory size_t slicePitch, //!< Slice pitch for host memory bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata() //!< Memory copy MetaData ) const; @@ -334,7 +334,7 @@ class HostBlitManager : public device::BlitManager { size_t rowPitch, //!< Row pitch for host memory size_t slicePitch, //!< Slice pitch for host memory bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata() //!< Memory copy MetaData ) const; @@ -393,7 +393,7 @@ class HostBlitManager : public device::BlitManager { const amd::Coord3D& dstOrigin, //!< Destination origin const amd::Coord3D& size, //!< Size of the copy region bool entire = false, //!< Entire buffer will be updated - amd::CopyMetadata copyMetadata = + amd::CopyMetadata copyMetadata = amd::CopyMetadata() //!< Memory copy MetaData ) const; @@ -423,36 +423,24 @@ class HostBlitManager : public device::BlitManager { const amd::Device& dev_; //!< Physical device // Packed Fill Buffer - class FillBufferInfo { - public: - FillBufferInfo(): fill_size_(0), expanded_pattern_(0), pattern_expanded_(false) {} + struct FillBufferInfo { + static constexpr uint32_t kExtendedSize = 2 * sizeof(uint64_t); - static bool PackInfo(const device::Memory& memory, size_t fill_size, - size_t fill_origin, const void* pattern, size_t pattern_size, - std::vector& packed_info); - - private: - static bool ExpandPattern64(uint64_t pattern, size_t pattern_size, uint64_t& pattern64); - - static inline void ClearBits64(uint64_t& pattern, uint64_t num_bits) { - pattern &= ~(~(static_cast(0)) << num_bits); + static void PackInfo(const device::Memory& memory, size_t fill_size, + size_t fill_origin, const void* pattern, size_t pattern_size, + std::vector& packed_info); + FillBufferInfo(size_t fill_size): fill_size_(fill_size), pattern_expanded_(false) { + memset(&expanded_pattern_, 0, sizeof(expanded_pattern_)); } - void clearInfo () { - fill_size_ = 0; - expanded_pattern_ = 0; - pattern_expanded_ = false; - } + void ExpandPattern(uint32_t pattern_size, const void* pattern); - public: - size_t fill_size_; // Fill size for this command - uint64_t expanded_pattern_; // Pattern for this command - bool pattern_expanded_; // Boolean to check if pattern is expanded + size_t fill_size_; //!< Fill size for this command + uint8_t expanded_pattern_[kExtendedSize]; //!< Pattern for this command - 16 bytes + bool pattern_expanded_; //!< Boolean to check if pattern is expanded }; - - private: //! Disable copy constructor HostBlitManager(const HostBlitManager&); diff --git a/projects/clr/rocclr/device/blitcl.cpp b/projects/clr/rocclr/device/blitcl.cpp index dbcd8e475a..f2dd110d35 100644 --- a/projects/clr/rocclr/device/blitcl.cpp +++ b/projects/clr/rocclr/device/blitcl.cpp @@ -47,16 +47,57 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( extern void __ockl_gws_init(uint nwm1, uint rid); - // Implementation - __kernel void __amd_rocclr_fillBufferAligned(__global uchar* bufUChar, - __global ushort* bufUShort, - __global uint* bufUInt, - __global ulong* bufULong, - __constant uchar* pattern, - uint patternSize, ulong offset, - ulong size) { - __amd_fillBufferAligned(bufUChar, bufUShort, bufUInt, bufULong, - pattern, patternSize, offset, size); + __kernel void __amd_rocclr_fillBufferAligned( + __global uchar* bufUChar, __global ushort* bufUShort, __global uint* bufUInt, + __global ulong* bufULong, __global ulong2* bufULong2, __constant uchar* pattern, + uint pattern_size, ulong offset, ulong end_ptr, uint next_chunk) { + int id = get_global_id(0); + long cur_id = offset + id * pattern_size; + if (bufULong2) { + __global ulong2* element = &bufULong2[cur_id]; + __constant ulong2* pt = (__constant ulong2*)pattern; + while ((ulong)element < end_ptr) { + for (uint i = 0; i < pattern_size; ++i) { + element[i] = pt[i]; + } + element += next_chunk; + } + } else if (bufULong) { + __global ulong* element = &bufULong[cur_id]; + __constant ulong* pt = (__constant ulong*)pattern; + while ((ulong)element < end_ptr) { + for (uint i = 0; i < pattern_size; ++i) { + element[i] = pt[i]; + } + element += next_chunk; + } + } else if (bufUInt) { + __global uint* element = &bufUInt[cur_id]; + __constant uint* pt = (__constant uint*)pattern; + while ((ulong)element < end_ptr) { + for (uint i = 0; i < pattern_size; ++i) { + element[i] = pt[i]; + } + element += next_chunk; + } + } else if (bufUShort) { + __global ushort* element = &bufUShort[cur_id]; + __constant ushort* pt = (__constant ushort*)pattern; + while ((ulong)element < end_ptr) { + for (uint i = 0; i < pattern_size; ++i) { + element[i] = pt[i]; + } + element += next_chunk; + } + } else { + __global uchar* element = &bufUChar[cur_id]; + while ((ulong)element < end_ptr) { + for (uint i = 0; i < pattern_size; ++i) { + element[i] = pattern[i]; + } + element += next_chunk; + } + } } __kernel void __amd_rocclr_fillBufferAligned2D(__global uchar* bufUChar, diff --git a/projects/clr/rocclr/device/pal/palblit.cpp b/projects/clr/rocclr/device/pal/palblit.cpp index 9d88b5e84a..bfb078ada0 100644 --- a/projects/clr/rocclr/device/pal/palblit.cpp +++ b/projects/clr/rocclr/device/pal/palblit.cpp @@ -2172,63 +2172,96 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, synchronize(); return result; } else { - uint fillType = FillBufferAligned; - size_t globalWorkOffset[3] = {0, 0, 0}; - uint64_t fillSize = size[0] / patternSize; - size_t globalWorkSize = amd::alignUp(fillSize, 256); - size_t localWorkSize = 256; - uint32_t alignment = (patternSize & 0x7) == 0 ? - sizeof(uint64_t) : - (patternSize & 0x3) == 0 ? - sizeof(uint32_t) : - (patternSize & 0x1) == 0 ? - sizeof(uint16_t) : sizeof(uint8_t); + // Pack the fill buffer info, that handles unaligned memories. + std::vector packed_vector{}; + FillBufferInfo::PackInfo(memory, size[0], origin[0], pattern, patternSize, packed_vector); - // Program kernels arguments for the fill operation - Memory* mem = &gpuMem(memory); - if (alignment == sizeof(uint64_t)) { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem); - } else if (alignment == sizeof(uint32_t)) { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr); - } else if (alignment == sizeof(uint16_t)) { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr); - } else { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr); + size_t overall_offset = origin[0]; + for (auto& packed_obj : packed_vector) { + constexpr uint32_t kFillType = FillBufferAligned; + uint32_t kpattern_size = (packed_obj.pattern_expanded_) + ? HostBlitManager::FillBufferInfo::kExtendedSize + : patternSize; + size_t kfill_size = packed_obj.fill_size_ / kpattern_size; + size_t koffset = overall_offset; + overall_offset += packed_obj.fill_size_; + + size_t globalWorkOffset[3] = {0, 0, 0}; + uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) : + (kpattern_size & 0x7) == 0 ? sizeof(uint64_t) : + (kpattern_size & 0x3) == 0 ? sizeof(uint32_t) : + (kpattern_size & 0x1) == 0 ? sizeof(uint16_t) : sizeof(uint8_t); + + // Program kernels arguments for the fill operation + Memory* mem = &gpuMem(memory); + if (alignment == 2 * sizeof(uint64_t)) { + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem); + } else if (alignment == sizeof(uint64_t)) { + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); + } else if (alignment == sizeof(uint32_t)) { + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); + } else if (alignment == sizeof(uint16_t)) { + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); + } else { + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); + } + const size_t localWorkSize = 256; + size_t globalWorkSize = + std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size); + globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize); + + Memory& gpuCB = gpu().xferWrite().Acquire(patternSize); + void* constBuf = gpuCB.map(&gpu(), Resource::NoWait); + // If pattern has been expanded, use the expanded pattern, otherwise use the default pattern + if (packed_obj.pattern_expanded_) { + memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size); + } else { + memcpy(constBuf, pattern, kpattern_size); + } + gpuCB.unmap(&gpu()); + Memory* pGpuCB = &gpuCB; + setArgument(kernels_[kFillType], 5, sizeof(cl_mem), &pGpuCB); + uint64_t offset = origin[0]; + + // Adjust the pattern size in the copy type size + kpattern_size /= alignment; + setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size); + koffset /= alignment; + setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset); + // Calculate max id + kfill_size = memory.virtualAddress() + (koffset + kfill_size * kpattern_size) * alignment; + setArgument(kernels_[kFillType], 8, sizeof(kfill_size), &kfill_size); + uint32_t next_chunk = globalWorkSize * kpattern_size; + setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); + + // Execute the blit + address parameters = kernels_[kFillType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[kFillType], parameters); + gpu().xferWrite().Release(gpuCB); } - Memory& gpuCB = gpu().xferWrite().Acquire(patternSize); - void* constBuf = gpuCB.map(&gpu(), Resource::NoWait); - memcpy(constBuf, pattern, patternSize); - gpuCB.unmap(&gpu()); - Memory* pGpuCB = &gpuCB; - setArgument(kernels_[fillType], 4, sizeof(cl_mem), &pGpuCB); - uint64_t offset = origin[0]; - - patternSize/= alignment; - offset /= alignment; - - setArgument(kernels_[fillType], 5, sizeof(uint32_t), &patternSize); - setArgument(kernels_[fillType], 6, sizeof(offset), &offset); - setArgument(kernels_[fillType], 7, sizeof(fillSize), &fillSize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[fillType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); - gpu().xferWrite().Release(gpuCB); } synchronize(); diff --git a/projects/clr/rocclr/device/pal/palsettings.cpp b/projects/clr/rocclr/device/pal/palsettings.cpp index aecb4fde84..704ef0292e 100644 --- a/projects/clr/rocclr/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/device/pal/palsettings.cpp @@ -148,6 +148,8 @@ Settings::Settings() { prepinnedMinSize_ = 0; cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki; useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? false : HIP_FORCE_DEV_KERNARG; + + limit_blit_wg_ = 16; } bool Settings::create(const Pal::DeviceProperties& palProp, @@ -288,7 +290,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp, // Cap at OpenCL20 for now if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20; - + use64BitPtr_ = LP64_SWITCH(false, true); if (oclVersion_ >= OpenCL20) { @@ -435,6 +437,10 @@ bool Settings::create(const Pal::DeviceProperties& palProp, prepinnedMinSize_ = PAL_PREPINNED_MEMORY_SIZE * Ki; } + limit_blit_wg_ = enableWgpMode_ + ? palProp.gfxipProperties.shaderCore.numAvailableCus / 2 + : palProp.gfxipProperties.shaderCore.numAvailableCus; + // Override current device settings override(); @@ -501,6 +507,10 @@ void Settings::override() { if (!flagIsDefault(PAL_ALWAYS_RESIDENT)) { alwaysResident_ = PAL_ALWAYS_RESIDENT; } + + if (!flagIsDefault(DEBUG_CLR_LIMIT_BLIT_WG)) { + limit_blit_wg_ = std::max(DEBUG_CLR_LIMIT_BLIT_WG, 0x1U); + } } } // namespace pal diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp index cb8e3ce890..33448c9211 100644 --- a/projects/clr/rocclr/device/pal/palsettings.hpp +++ b/projects/clr/rocclr/device/pal/palsettings.hpp @@ -118,6 +118,7 @@ class Settings : public device::Settings { amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler size_t prepinnedMinSize_; //!< minimal memory size for prepinned transfer + uint32_t limit_blit_wg_; //!< The number of workgroups for blit execution //! Default constructor Settings(); diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 2e4d23c72d..b38c210de8 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -778,6 +778,9 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) { // Add mask array for AmdAqlWrap slots allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8; + // Align size to 64 bytes for more efficient fill operation + allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t)); + virtualQueue_ = new Memory(dev(), allocSize); Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote; if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) { diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 5d8c4f77b9..4cfd22a51e 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -2048,9 +2048,10 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, } // ================================================================================================ -bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern, size_t patternSize, - const amd::Coord3D& surface, const amd::Coord3D& origin, - const amd::Coord3D& size, bool entire, bool forceBlit) const { +bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern, + size_t patternSize, const amd::Coord3D& surface, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire, bool forceBlit) const { amd::ScopedLock k(lockXferOps_); bool result = false; @@ -2063,79 +2064,90 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern synchronize(); return result; } else { - // Pack the fill buffer info, that handles unaligned memories. std::vector packed_vector{}; FillBufferInfo::PackInfo(memory, size[0], origin[0], pattern, patternSize, packed_vector); size_t overall_offset = origin[0]; for (auto& packed_obj: packed_vector) { - uint fillType = FillBufferAligned; - - uint32_t kpattern_size32 = (packed_obj.pattern_expanded_) ? sizeof(size_t) : patternSize; - size_t kfill_size = packed_obj.fill_size_/kpattern_size32; + constexpr uint32_t kFillType = FillBufferAligned; + uint32_t kpattern_size = (packed_obj.pattern_expanded_) ? + HostBlitManager::FillBufferInfo::kExtendedSize : patternSize; + size_t kfill_size = packed_obj.fill_size_ / kpattern_size; size_t koffset = overall_offset; overall_offset += packed_obj.fill_size_; size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize = amd::alignUp(kfill_size, 256); - size_t localWorkSize = 256; - - uint32_t alignment = (kpattern_size32 & 0x7) == 0 ? - sizeof(uint64_t) : - (kpattern_size32 & 0x3) == 0 ? - sizeof(uint32_t) : - (kpattern_size32 & 0x1) == 0 ? - sizeof(uint16_t) : sizeof(uint8_t); - + uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) : + (kpattern_size & 0x7) == 0 ? sizeof(uint64_t) : + (kpattern_size & 0x3) == 0 ? sizeof(uint32_t) : + (kpattern_size & 0x1) == 0 ? sizeof(uint16_t) : sizeof(uint8_t); // Program kernels arguments for the fill operation cl_mem mem = as_cl(memory.owner()); - if (alignment == sizeof(uint64_t)) { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem); + if (alignment == 2 * sizeof(uint64_t)) { + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem); + } else if (alignment == sizeof(uint64_t)) { + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); } else if (alignment == sizeof(uint32_t)) { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); } else if (alignment == sizeof(uint16_t)) { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); } else { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); + setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); } + const size_t localWorkSize = 256; + size_t globalWorkSize = + std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size); + globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize); auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment); // If pattern has been expanded, use the expanded pattern, otherwise use the default pattern. if (packed_obj.pattern_expanded_) { - memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size32); + memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size); } else { - memcpy(constBuf, pattern, kpattern_size32); + memcpy(constBuf, pattern, kpattern_size); } constexpr bool kDirectVa = true; - setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa); + setArgument(kernels_[kFillType], 5, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa); + // Adjust the pattern size in the copy type size + kpattern_size /= alignment; + setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size); koffset /= alignment; - kpattern_size32 /= alignment; - - setArgument(kernels_[fillType], 5, sizeof(uint32_t), &kpattern_size32); - setArgument(kernels_[fillType], 6, sizeof(koffset), &koffset); - setArgument(kernels_[fillType], 7, sizeof(kfill_size), &kfill_size); + setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset); + // Calculate max id + kfill_size = memory.virtualAddress() + (koffset + kfill_size * kpattern_size) * alignment; + setArgument(kernels_[kFillType], 8, sizeof(kfill_size), &kfill_size); + uint32_t next_chunk = globalWorkSize * kpattern_size; + setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); // Execute the blit - address parameters = captureArguments(kernels_[fillType]); - result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, nullptr); + address parameters = captureArguments(kernels_[kFillType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[kFillType], parameters, nullptr); releaseArguments(parameters); } } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 86a102d441..f5b6597f71 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -769,7 +769,10 @@ bool Device::create() { pciDeviceId_); return false; } - + hsaSettings->limit_blit_wg_ = info().maxComputeUnits_; + if (!flagIsDefault(DEBUG_CLR_LIMIT_BLIT_WG)) { + hsaSettings->limit_blit_wg_ = std::max(DEBUG_CLR_LIMIT_BLIT_WG, 0x1U); + } amd::Context::Info info = {0}; std::vector devices; devices.push_back(this); diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index cbd929cd34..16368b06b7 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -96,6 +96,7 @@ Settings::Settings() { host_hdp_flush_ = true; gwsInitSupported_ = true; + limit_blit_wg_ = 16; } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index ced98dc03d..fb7b679fb0 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -76,6 +76,7 @@ class Settings : public device::Settings { size_t sdmaCopyThreshold_; //!< Use SDMA to copy above this size uint32_t hmmFlags_; //!< HMM functionality control flags + uint32_t limit_blit_wg_; //!< The number of workgroups for blit execution //! Default constructor Settings(); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 09674b96fd..609fb4cf7a 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -2828,8 +2828,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) // Add mask array for AmdAqlWrap slots allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8; - // Make sure the allocation size aligns with DWORD. - allocSize = amd::alignUp(allocSize, sizeof(uint64_t)); + // Align size to 64 bytes for more efficient fill operation + allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t)); // CL_MEM_ALLOC_HOST_PTR/CL_MEM_READ_WRITE virtualQueue_ = new (dev().context()) amd::Buffer(dev().context(), CL_MEM_READ_WRITE, allocSize); diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 5c7ff852a0..488ac8e315 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -221,6 +221,8 @@ release(uint, ROC_AQL_QUEUE_SIZE, 16384, \ "AQL queue size in AQL packets") \ release(uint, ROC_SIGNAL_POOL_SIZE, 32, \ "Initial size of HSA signal pool") \ +release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \ + "Limit the number of workgroups in blit operations") \ release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \ "If true, then runtime can skip kernel arg copy") \ release(bool, GPU_STREAMOPS_CP_WAIT, false, \