From b8a9cdb20e593b85142e85c0cb294c9504e271ca Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 30 May 2018 16:22:34 -0400
Subject: [PATCH] P4 to Git Change 1561794 by gandryey@gera-w8 on 2018/05/30
16:14:30
SWDEV-79445 - OCL generic changes and code clean-up
- Move memory sync logic to processMemObjectsHSA() to simplify the arguments setup
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#102 edit
---
rocclr/runtime/device/pal/palkernel.cpp | 45 ------
rocclr/runtime/device/pal/palvirtual.cpp | 197 +++++++++++++----------
2 files changed, 115 insertions(+), 127 deletions(-)
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 6c4b779ce2..c2a2be0c01 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -998,42 +998,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
assert(
(arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) &&
"Unsupported address qualifier");
-
- // If it is a global pointer
- Memory* gpuMem = nullptr;
- amd::Memory* mem = nullptr;
- uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
- if (nativeMem) {
- gpuMem = reinterpret_cast(memories)[index];
- if (nullptr != gpuMem) {
- mem = gpuMem->owner();
- }
- } else {
- mem = memories[index];
- if (mem != nullptr) {
- gpuMem = dev().getGpuMemory(mem);
- }
- }
-
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr));
- if (gpuMem == nullptr) {
- break;
- }
-
- // Wait for resource if it was used on an inactive engine
- //! \note syncCache may call DRM transfer
- gpuMem->wait(gpu, WaitOnBusyEngine);
-
- //! @todo Compiler has to return read/write attributes
- if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
- mem->signalWrite(&dev());
- }
- gpu.addVmMemory(gpuMem);
-
- // save the memory object pointer to allow global memory access
- if (nullptr != dev().hwDebugMgr()) {
- dev().hwDebugMgr()->assignKernelParamMem(arg->index_, gpuMem->owner());
- }
break;
}
case HSAIL_ARGTYPE_REFERENCE: {
@@ -1072,10 +1037,6 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
}
}
- // Wait for resource if it was used on an inactive engine
- //! \note syncCache may call DRM transfer
- image->wait(gpu, WaitOnBusyEngine);
-
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
@@ -1092,12 +1053,6 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
srdResource = true;
}
- //! @todo Compiler has to return read/write attributes
- if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
- mem->signalWrite(&dev());
- }
-
- gpu.addVmMemory(image);
if (image->desc().isDoppTexture_) {
gpu.addDoppRef(image, kernel.parameters().getExecNewVcop(),
kernel.parameters().getExecPfpaVcop());
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index eef832ea60..da764c3cde 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -428,6 +428,9 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
uint64_t curEnd = curStart + memory->size();
if (memory->isModified(gpu) || !readOnly) {
+ // Mark resource as modified
+ memory->setModified(gpu, !readOnly);
+
// Loop through all memory objects in the queue and find dependency
// @note don't include objects from the current kernel
for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
@@ -473,8 +476,6 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd;
memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly;
numMemObjectsInQueue_++;
- // Mark resource as modified
- memory->setModified(gpu, !readOnly);
}
void VirtualGPU::MemoryDependency::clear(bool all) {
@@ -490,8 +491,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
}
- // Clear all objects except current kernel
- memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_);
+ // Adjust the number of active objects
numMemObjectsInQueue_ -= endMemObjectsInQueue_;
endMemObjectsInQueue_ = 0;
}
@@ -1925,17 +1925,17 @@ bool VirtualGPU::PreDeviceEnqueue(
return false;
}
else {
- if (dev().settings().useDeviceQueue_) {
- *gpuDefQueue = static_cast(defQueue->vDev());
- if ((*gpuDefQueue)->hwRing() == hwRing()) {
- LogError("Can't submit the child kernels to the same HW ring as the host queue!");
- return false;
- }
- }
- else {
- createVirtualQueue(defQueue->size());
- *gpuDefQueue = this;
+ if (dev().settings().useDeviceQueue_) {
+ *gpuDefQueue = static_cast(defQueue->vDev());
+ if ((*gpuDefQueue)->hwRing() == hwRing()) {
+ LogError("Can't submit the child kernels to the same HW ring as the host queue!");
+ return false;
}
+ }
+ else {
+ createVirtualQueue(defQueue->size());
+ *gpuDefQueue = this;
+ }
}
*vmDefQueue = (*gpuDefQueue)->virtualQueue_->vmAddress();
@@ -2084,12 +2084,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
return false;
}
- // Check memory dependency and SVM objects
- if (!processMemObjectsHSA(kernel, parameters, nativeMem)) {
- LogError("Wrong memory objects!");
- return false;
- }
-
// Add ISA memory object to the resource tracking list
AddKernel(kernel);
@@ -2102,6 +2096,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
}
+ // Check memory dependency and SVM objects
+ if (!processMemObjectsHSA(kernel, parameters, nativeMem)) {
+ LogError("Wrong memory objects!");
+ return false;
+ }
+
bool needFlush = false;
// Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
// are in the same cmdBuffer
@@ -2151,10 +2151,33 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0],
&(const_cast(sizes).local()[0]));
+ if (iter > 0) {
+ // Updates the timestamp values, since a CB flush could occur.
+ // Resource processing was moved from loadArguments() and
+ // an extra loop is required.
+ const amd::KernelParameters& kernelParams = kernel.parameters();
+ amd::Memory* const* memories =
+ reinterpret_cast(parameters + kernelParams.memoryObjOffset());
+ for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) {
+ if (nativeMem) {
+ Memory* gpuMem = reinterpret_cast(memories)[i];
+ if (gpuMem != nullptr) {
+ gpuMem->setBusy(*this, gpuEvent);
+ }
+ }
+ else {
+ amd::Memory* mem = memories[i];
+ if (mem != nullptr) {
+ dev().getGpuMemory(mem)->setBusy(*this, gpuEvent);
+ }
+ }
+ }
+ }
+
uint64_t vmParentWrap = 0;
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
- *this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap);
+ *this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap);
if (nullptr == aqlPkt) {
LogError("Couldn't load kernel arguments");
return false;
@@ -2909,88 +2932,87 @@ void VirtualGPU::profileEvent(EngineType engine, bool type) const {
bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
bool nativeMem) {
- const HSAILKernel& hsaKernel =
- static_cast(*(kernel.getDeviceKernel(dev())));
- const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
- std::vector memList;
+
// Mark the tracker with a new kernel,
// so we can avoid checks of the aliased objects
memoryDependency().newKernel();
- bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true);
- bool supportFineGrainedSystem = deviceSupportFGS;
- FGSStatus status = kernelParams.getSvmSystemPointersSupport();
- switch (status) {
- case FGS_YES:
- if (!deviceSupportFGS) {
- return false;
- }
- supportFineGrainedSystem = true;
- break;
- case FGS_NO:
- supportFineGrainedSystem = false;
- break;
- case FGS_DEFAULT:
- default:
- break;
- }
-
size_t count = kernelParams.getNumberOfSvmPtr();
- size_t execInfoOffset = kernelParams.getExecInfoOffset();
- bool sync = true;
-
- // get svm non arugment information
- void* const* svmPtrArray = reinterpret_cast(params + execInfoOffset);
- for (size_t i = 0; i < count; i++) {
- amd::Memory* memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
- if (nullptr == memory) {
- if (!supportFineGrainedSystem) {
- return false;
- } else if (sync) {
- addBarrier();
- // Clear memory dependency state
- const static bool All = true;
- memoryDependency().clear(!All);
- continue;
- }
- } else {
- Memory* gpuMemory = dev().getGpuMemory(memory);
- if (nullptr != gpuMemory) {
- // Synchronize data with other memory instances if necessary
- gpuMemory->syncCacheFromHost(*this);
-
- const static bool IsReadOnly = false;
- // Validate SVM passed in the non argument list
- memoryDependency().validate(*this, gpuMemory, IsReadOnly);
-
- // Mark signal write for cache coherency,
- // since this object isn't a part of kernel arg setup
- if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
- memory->signalWrite(&dev());
+ if (count > 0) {
+ bool supportFineGrainedSystem = dev().isFineGrainedSystem(true);
+ FGSStatus status = kernelParams.getSvmSystemPointersSupport();
+ switch (status) {
+ case FGS_YES:
+ if (!supportFineGrainedSystem) {
+ return false;
+ }
+ break;
+ case FGS_NO:
+ supportFineGrainedSystem = false;
+ break;
+ case FGS_DEFAULT:
+ default:
+ break;
+ }
+ // get svm non argument information
+ void* const* svmPtrArray = reinterpret_cast(
+ params + kernelParams.getExecInfoOffset());
+ for (size_t i = 0; i < count; i++) {
+ amd::Memory* memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
+ if (nullptr == memory) {
+ if (!supportFineGrainedSystem) {
+ return false;
+ } else {
+ addBarrier();
+ // Clear memory dependency state
+ const static bool All = true;
+ memoryDependency().clear(!All);
+ continue;
}
-
- memList.push_back(gpuMemory);
} else {
- return false;
+ Memory* gpuMemory = dev().getGpuMemory(memory);
+ if (nullptr != gpuMemory) {
+ // Synchronize data with other memory instances if necessary
+ gpuMemory->syncCacheFromHost(*this);
+
+ const static bool IsReadOnly = false;
+ // Validate SVM passed in the non argument list
+ memoryDependency().validate(*this, gpuMemory, IsReadOnly);
+
+ // Wait for resource if it was used on an inactive engine
+ //! \note syncCache may call DRM transfer
+ constexpr bool WaitOnBusyEngine = true;
+ gpuMemory->wait(*this, WaitOnBusyEngine);
+
+ // Mark signal write for cache coherency,
+ // since this object isn't a part of kernel arg setup
+ if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
+ memory->signalWrite(&dev());
+ }
+ addVmMemory(gpuMemory);
+ } else {
+ return false;
+ }
}
}
}
- for (auto it : memList) {
- addVmMemory(it);
- }
amd::Memory* const* memories =
reinterpret_cast(params + kernelParams.memoryObjOffset());
+ const HSAILKernel& hsaKernel =
+ static_cast(*(kernel.getDeviceKernel(dev())));
+ const amd::KernelSignature& signature = kernel.signature();
+
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i);
- Memory* gpuMem = nullptr;
- amd::Memory* mem = nullptr;
// Find if current argument is a buffer
if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
+ Memory* gpuMem = nullptr;
+ amd::Memory* mem = nullptr;
uint32_t index = desc.info_.arrayIndex_;
if (nativeMem) {
gpuMem = reinterpret_cast(memories)[index];
@@ -3019,6 +3041,17 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
// Validate memory for a dependency in the queue
memoryDependency().validate(*this, gpuMem, readOnly);
+
+ // Wait for resource if it was used on an inactive engine
+ //! \note syncCache may call DRM transfer
+ constexpr bool WaitOnBusyEngine = true;
+ gpuMem->wait(*this, WaitOnBusyEngine);
+
+ //! Check if compiler expects read/write
+ if ((mem != nullptr) && !desc.info_.readOnly_) {
+ mem->signalWrite(&dev());
+ }
+ addVmMemory(gpuMem);
}
}
}