Process cache coherency before mem dependency tracker

An optimization to remove extra syncs uncovered a bug in the cache
coherency layer, where the runtime could lose track of a memory address
if the coherency layer performed a sync.

Change-Id: I25647cfa4a4be9cdbd8577ff076a740bbdac79c8
Tento commit je obsažen v:
German Andryeyev
2020-07-23 18:17:26 -04:00
rodič 9d0b0c32a9
revize 91a25df04f
+22 -11
Zobrazit soubor
@@ -211,8 +211,27 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
setAqlHeader(dispatchPacketHeaderNoSync_);
}
// Mark the tracker with a new kernel,
// so we can avoid checks of the aliased objects
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
// HIP shouldn't use cache coherency layer at any time
if (!amd::IS_HIP) {
// Process cache coherency first, since the extra transfers may affect
// other mem dependency tracking logic: TS and signalWrite()
for (uint i = 0; i < signature.numMemories(); ++i) {
amd::Memory* mem = memories[i];
if (mem != nullptr) {
roc::Memory* gpuMem = dev().getGpuMemory(mem);
// Don't sync for internal objects, since they are not shared between devices
if (gpuMem->owner()->getVirtualDevice() == nullptr) {
// Synchronize data with other memory instances if necessary
gpuMem->syncCacheFromHost(*this);
}
}
}
}
// Mark the tracker with a new kernel, so it can avoid checks of the aliased objects
memoryDependency().newKernel();
bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true);
@@ -268,9 +287,6 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
}
}
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
@@ -314,12 +330,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
}
else {
gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));
// Don't sync for internal objects,
// since they are not shared between devices
if (gpuMem->owner()->getVirtualDevice() == nullptr) {
// Synchronize data with other memory instances if necessary
gpuMem->syncCacheFromHost(*this);
}
const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
desc.typeName_.c_str(), desc.name_.c_str(),