P4 to Git Change 1981122 by gandryey@gera-win10 on 2019/08/09 17:59:31

SWDEV-79445 - OCL generic changes and code clean-up
	- Allow async execution with scratch on the same queue. COMPUTE_TMPRING_SIZE.WAVESIZE should be constant across all dispatches.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#151 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#40 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#145 edit
Этот коммит содержится в:
foreman
2019-08-09 18:04:15 -04:00
родитель 2fbff434ba
Коммит 0a2d4e2dc3
3 изменённых файлов: 29 добавлений и 26 удалений
+23 -24
Просмотреть файл
@@ -2011,6 +2011,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
ScopedLockVgpus lock(*this);
scratch_[sb]->size_ = newSize;
scratch_[sb]->privateMemSize_ = regNum * sizeof(uint32_t);
uint64_t size = 0;
uint64_t offset = 0;
@@ -2064,8 +2066,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu, uint vgprs) {
return true;
}
bool Device::validateKernel(
const amd::Kernel& kernel, const device::VirtualDevice* vdev, bool coop_groups) {
bool Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev,
bool coop_groups) {
// Find the number of scratch registers used in the kernel
const device::Kernel* devKernel = kernel.getDeviceKernel(*this);
uint regNum = static_cast<uint>(devKernel->workGroupInfo()->scratchRegs_);
@@ -2239,33 +2241,30 @@ void Device::svmFree(void* ptr) const {
}
bool Device::AcquireExclusiveGpuAccess() {
// Lock the virtual GPU list
vgpusAccess().lock();
// Lock the virtual GPU list
vgpusAccess().lock();
// Find all available virtual GPUs and lock them
// from the execution of commands
for (uint idx = 0; idx < vgpus().size(); ++idx) {
vgpus()[idx]->execution().lock();
// Make sure a wait is done
vgpus()[idx]->WaitForIdleCompute();
}
// if (!hsa_exclusive_gpu_access_) {
// @todo call rocr
// hsa_exclusive_gpu_access_ = true;
// }
return true;
// Find all available virtual GPUs and lock them
// from the execution of commands
for (uint idx = 0; idx < vgpus().size(); ++idx) {
vgpus()[idx]->execution().lock();
// Make sure a wait is done
vgpus()[idx]->WaitForIdleCompute();
}
return true;
}
void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
vgpu.WaitForIdleCompute();
// Find all available virtual GPUs and unlock them
// for the execution of commands
for (uint idx = 0; idx < vgpus().size(); ++idx) {
vgpus()[idx]->execution().unlock();
}
vgpu.WaitForIdleCompute();
// Find all available virtual GPUs and unlock them
// for the execution of commands
for (uint idx = 0; idx < vgpus().size(); ++idx) {
vgpus()[idx]->execution().unlock();
}
// Unlock the virtual GPU list
vgpusAccess().unlock();
// Unlock the virtual GPU list
vgpusAccess().unlock();
}
Device::SrdManager::~SrdManager() {
+2 -1
Просмотреть файл
@@ -242,9 +242,10 @@ class Device : public NullDevice {
Memory* memObj_; //!< Memory objects for scratch buffers
uint64_t offset_; //!< Offset from the global scratch store
uint64_t size_; //!< Scratch buffer size on this queue
uint64_t privateMemSize_; //!< Private memory size per thread, allowed by the current scratch
//! Default constructor
ScratchBuffer() : memObj_(NULL), offset_(0), size_(0) {}
ScratchBuffer() : memObj_(nullptr), offset_(0), size_(0), privateMemSize_(0) {}
//! Destructor
~ScratchBuffer();
+4 -1
Просмотреть файл
@@ -2370,12 +2370,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
dispatchParam.scratchSize = scratch->size_;
dispatchParam.scratchOffset = scratch->offset_;
// Use maximum available slots for all dispatches to allow async on the same queue
// HW value loaded into SGPR is an offset value calculated as
// wave_slot * COMPUTE_TMPRING_SIZE.WAVESIZE
dispatchParam.workitemPrivateSegmentSize = scratch->privateMemSize_;
}
dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
// Run AQL dispatch in HW
eventBegin(MainEngine);