From 91a25df04fbb53efc17551b7b974a574b45dad2f Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Thu, 23 Jul 2020 18:17:26 -0400 Subject: [PATCH] Process cache coherency before mem dependency tracker Optimizaiton to remove extra syncs uncovered a bug with the cache coherency layer, there runtime could lose the track of mem address if coherency layer performed a sync. Change-Id: I25647cfa4a4be9cdbd8577ff076a740bbdac79c8 --- rocclr/device/rocm/rocvirtual.cpp | 33 ++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 42e3352c61..09f1fea156 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -211,8 +211,27 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para setAqlHeader(dispatchPacketHeaderNoSync_); } - // Mark the tracker with a new kernel, - // so we can avoid checks of the aliased objects + amd::Memory* const* memories = + reinterpret_cast(params + kernelParams.memoryObjOffset()); + + // HIP shouldn't use cache coherency layer at any time + if (!amd::IS_HIP) { + // Process cache coherency first, since the extra transfers may affect + // other mem dependency tracking logic: TS and signalWrite() + for (uint i = 0; i < signature.numMemories(); ++i) { + amd::Memory* mem = memories[i]; + if (mem != nullptr) { + roc::Memory* gpuMem = dev().getGpuMemory(mem); + // Don't sync for internal objects, since they are not shared between devices + if (gpuMem->owner()->getVirtualDevice() == nullptr) { + // Synchronize data with other memory instances if necessary + gpuMem->syncCacheFromHost(*this); + } + } + } + } + + // Mark the tracker with a new kernel, so it can avoid checks of the aliased objects memoryDependency().newKernel(); bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true); @@ -268,9 +287,6 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para } } - amd::Memory* const* memories = - reinterpret_cast(params + kernelParams.memoryObjOffset()); - // Check all parameters for the current kernel for (size_t i = 0; i < signature.numParameters(); ++i) { const amd::KernelParameterDescriptor& desc = signature.at(i); @@ -314,12 +330,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para } else { gpuMem = static_cast(mem->getDeviceMemory(dev())); - // Don't sync for internal objects, - // since they are not shared between devices - if (gpuMem->owner()->getVirtualDevice() == nullptr) { - // Synchronize data with other memory instances if necessary - gpuMem->syncCacheFromHost(*this); - } + const void* globalAddress = *reinterpret_cast(params + desc.offset_); ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index, desc.typeName_.c_str(), desc.name_.c_str(),