diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp index cec860d7d2..0c0bf44400 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -2380,6 +2380,8 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { uint64_t markerOffset = markerAddr - surfAddr; cs()->p2pMarkerOp(gpuMemory->gslResource(), vcmd.markerValue(), markerOffset, false); } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { + static constexpr bool FlushL2 = true; + flushCUCaches(FlushL2); cs()->p2pMarkerOp(gpuMemory->gslResource(), vcmd.markerValue(), vcmd.markerOffset(), true); } eventEnd(MainEngine, gpuEvent); diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index f80f730394..f7564b328f 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -2537,6 +2537,17 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { iCmd()->CmdWaitBusAddressableMemoryMarker(*(pGpuMemory->iMem()), value, 0xFFFFFFFF, Pal::CompareFunc::GreaterEqual); } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { + // Make sure GPU finished operation and data reached memory before the marker write + static constexpr bool FlushL2 = true; + addBarrier(FlushL2); + // \todo: Implement the right changes in PAL + // Workarounds: for CP overfetch issues and the lack of SDMA sync + { + // Flush CB associated with the DGMA buffer + isDone(pGpuMemory->getGpuEvent(*this)); + // Make sure SDMA is done on the DGMA buffer + pGpuMemory->wait(*this, true); + } iCmd()->CmdUpdateBusAddressableMemoryMarker(*(pGpuMemory->iMem()), value); } eventEnd(MainEngine, gpuEvent);