P4 to Git Change 1981122 by gandryey@gera-win10 on 2019/08/09 17:59:31
SWDEV-79445 - OCL generic changes and code clean-up - Allow async execution with scratch on the same queue. COMPUTE_TMPRING_SIZE.WAVESIZE should be constant across all dispatches. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#151 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#40 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#145 edit
Этот коммит содержится в:
@@ -2011,6 +2011,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
|
||||
ScopedLockVgpus lock(*this);
|
||||
|
||||
scratch_[sb]->size_ = newSize;
|
||||
scratch_[sb]->privateMemSize_ = regNum * sizeof(uint32_t);
|
||||
|
||||
uint64_t size = 0;
|
||||
uint64_t offset = 0;
|
||||
|
||||
@@ -2064,8 +2066,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Device::validateKernel(
|
||||
const amd::Kernel& kernel, const device::VirtualDevice* vdev, bool coop_groups) {
|
||||
bool Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev,
|
||||
bool coop_groups) {
|
||||
// Find the number of scratch registers used in the kernel
|
||||
const device::Kernel* devKernel = kernel.getDeviceKernel(*this);
|
||||
uint regNum = static_cast<uint>(devKernel->workGroupInfo()->scratchRegs_);
|
||||
@@ -2239,33 +2241,30 @@ void Device::svmFree(void* ptr) const {
|
||||
}
|
||||
|
||||
bool Device::AcquireExclusiveGpuAccess() {
|
||||
// Lock the virtual GPU list
|
||||
vgpusAccess().lock();
|
||||
// Lock the virtual GPU list
|
||||
vgpusAccess().lock();
|
||||
|
||||
// Find all available virtual GPUs and lock them
|
||||
// from the execution of commands
|
||||
for (uint idx = 0; idx < vgpus().size(); ++idx) {
|
||||
vgpus()[idx]->execution().lock();
|
||||
// Make sure a wait is done
|
||||
vgpus()[idx]->WaitForIdleCompute();
|
||||
}
|
||||
// if (!hsa_exclusive_gpu_access_) {
|
||||
// @todo call rocr
|
||||
// hsa_exclusive_gpu_access_ = true;
|
||||
// }
|
||||
return true;
|
||||
// Find all available virtual GPUs and lock them
|
||||
// from the execution of commands
|
||||
for (uint idx = 0; idx < vgpus().size(); ++idx) {
|
||||
vgpus()[idx]->execution().lock();
|
||||
// Make sure a wait is done
|
||||
vgpus()[idx]->WaitForIdleCompute();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
|
||||
vgpu.WaitForIdleCompute();
|
||||
// Find all available virtual GPUs and unlock them
|
||||
// for the execution of commands
|
||||
for (uint idx = 0; idx < vgpus().size(); ++idx) {
|
||||
vgpus()[idx]->execution().unlock();
|
||||
}
|
||||
vgpu.WaitForIdleCompute();
|
||||
// Find all available virtual GPUs and unlock them
|
||||
// for the execution of commands
|
||||
for (uint idx = 0; idx < vgpus().size(); ++idx) {
|
||||
vgpus()[idx]->execution().unlock();
|
||||
}
|
||||
|
||||
// Unlock the virtual GPU list
|
||||
vgpusAccess().unlock();
|
||||
// Unlock the virtual GPU list
|
||||
vgpusAccess().unlock();
|
||||
}
|
||||
|
||||
Device::SrdManager::~SrdManager() {
|
||||
|
||||
@@ -242,9 +242,10 @@ class Device : public NullDevice {
|
||||
Memory* memObj_; //!< Memory objects for scratch buffers
|
||||
uint64_t offset_; //!< Offset from the global scratch store
|
||||
uint64_t size_; //!< Scratch buffer size on this queue
|
||||
uint64_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch
|
||||
|
||||
//! Default constructor
|
||||
ScratchBuffer() : memObj_(NULL), offset_(0), size_(0) {}
|
||||
ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}
|
||||
|
||||
//! Destructor
|
||||
~ScratchBuffer();
|
||||
|
||||
@@ -2370,12 +2370,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
|
||||
dispatchParam.scratchSize = scratch->size_;
|
||||
dispatchParam.scratchOffset = scratch->offset_;
|
||||
// Use maximum available slots for all dispatches to allow async on the same queue
|
||||
// HW value loaded into SGPR is an offset value calculated as
|
||||
// wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
|
||||
dispatchParam.workitemPrivateSegmentSize = scratch->privateMemSize_;
|
||||
}
|
||||
dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
|
||||
dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
|
||||
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
|
||||
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
|
||||
dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
|
||||
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
|
||||
// Run AQL dispatch in HW
|
||||
eventBegin(MainEngine);
|
||||
|
||||
Ссылка в новой задаче
Block a user