P4 to Git Change 1982729 by gandryey@gera-win10 on 2019/08/13 17:40:55
SWDEV-79445 - OCL generic changes and code clean-up
- Use max number of waves per SIMD in the scratch calculation to allow async kernel execution with the scratch buffer

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#155 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#43 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#147 edit
このコミットが含まれているのは:
@@ -1996,7 +1996,9 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
|
||||
// Calculate the size of the scratch buffer for a queue
|
||||
uint32_t numTotalCUs = properties().gfxipProperties.shaderCore.numAvailableCus;
|
||||
// Find max waves based on VGPR per SIMD
|
||||
uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numAvailableVgprs / vgprs;
|
||||
// note: Select maximum to allow possible kernel async execution,
|
||||
// but optimal is (numAvailableVgprs / vgprs)
|
||||
uint32_t numMaxWaves = properties().gfxipProperties.shaderCore.numWavefrontsPerSimd;
|
||||
// Find max waves per CU
|
||||
numMaxWaves *= properties().gfxipProperties.shaderCore.numSimdsPerCu;
|
||||
// Find max waves per device
|
||||
@@ -2019,11 +2021,14 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
|
||||
ScratchBuffer* scratchBuf = scratch_[s];
|
||||
if (scratchBuf->size_ > 0) {
|
||||
scratchBuf->destroyMemory();
|
||||
scratchBuf->size_ = std::min(newSize, info().maxMemAllocSize_);
|
||||
scratchBuf->size_ = std::min(newSize, uint64_t(3 * Gi));
|
||||
// Note: Generic address space setup in HW requires 64KB alignment for scratch
|
||||
scratchBuf->size_ = amd::alignUp(newSize, 64 * Ki);
|
||||
scratchBuf->privateMemSize_ = regNum * sizeof(uint32_t);
|
||||
// Adjust the size for the current queue only
|
||||
if (s == sb) {
|
||||
scratchBuf->size_ = std::min(newSize, info().maxMemAllocSize_);
|
||||
scratchBuf->size_ = std::min(newSize, uint64_t(3 * Gi));
|
||||
// Note: Generic address space setup in HW requires 64KB alignment for scratch
|
||||
scratchBuf->size_ = amd::alignUp(newSize, 64 * Ki);
|
||||
scratchBuf->privateMemSize_ = regNum * sizeof(uint32_t);
|
||||
}
|
||||
scratchBuf->offset_ = offset;
|
||||
size += scratchBuf->size_;
|
||||
offset += scratchBuf->size_;
|
||||
|
||||
@@ -250,7 +250,7 @@ class Device : public NullDevice {
|
||||
Memory* memObj_; //!< Memory objects for scratch buffers
|
||||
uint64_t offset_; //!< Offset from the global scratch store
|
||||
uint64_t size_; //!< Scratch buffer size on this queue
|
||||
uint64_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch
|
||||
uint32_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch
|
||||
|
||||
//! Default constructor
|
||||
ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}
|
||||
|
||||
@@ -2444,7 +2444,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
// Use maximum available slots for all dispatches to allow async on the same queue
|
||||
// HW value loaded into SGPR is an offset value calculated as
|
||||
// wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
|
||||
dispatchParam.workitemPrivateSegmentSize = scratch->privateMemSize_;
|
||||
dispatchParam.workitemPrivateSegmentSize = std::max(hsaKernel.spillSegSize(), scratch->privateMemSize_);
|
||||
}
|
||||
dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
|
||||
dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
|
||||
|
||||
新しいイシューから参照
ユーザーをブロックする