From 05efcb5556ec3fd1216fedd3084ad4cf07d98edf Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 25 Jul 2019 11:38:30 -0400 Subject: [PATCH] P4 to Git Change 1973670 by gandryey@gera-win10 on 2019/07/25 11:27:11 SWDEV-79445 - OCL generic changes and code clean-up - Don't create an extra queue for DMA transfers when SDMA is disabled. That should allow the runtime to avoid useless sync operations. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#15 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#75 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.cpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#143 edit [ROCm/clr commit: b9b798616f94333ff7b1e7310d7c87b8ba6da4a2] --- .../clr/rocclr/runtime/device/pal/palconstbuf.cpp | 6 ++++-- .../clr/rocclr/runtime/device/pal/palgpuopen.cpp | 7 ++++--- .../clr/rocclr/runtime/device/pal/palresource.cpp | 12 +++++++++++- .../rocclr/runtime/device/pal/paltimestamp.cpp | 13 ++++++++----- .../rocclr/runtime/device/pal/paltimestamp.hpp | 5 ++--- .../clr/rocclr/runtime/device/pal/palvirtual.cpp | 15 +++++---------- 6 files changed, 34 insertions(+), 24 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp index 3bf5be1fd0..31cf628ebd 100644 --- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp @@ -69,8 +69,10 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) { // Get the next buffer in the list ++activeBuffer_; activeBuffer_ %= MaxNumberOfBuffers; - // Make sure the buffer isn't busy - gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]); + if (!gpu().dev().settings().disableSdma_) { + 
// Make sure the buffer isn't busy + gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]); + } gpu().waitForEvent(&pool_[activeBuffer_].events[MainEngine]); wrtAddress_ = pool_[activeBuffer_].buf->data(); wrtOffset_ = 0; diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp index 21af7690df..f548d40693 100644 --- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp @@ -582,10 +582,11 @@ Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) { // end the session itself if (result == Pal::Result::Success) { assert(trace_.gpa_session_ != nullptr); + EngineType engine = (gpu->dev().settings().disableSdma_) ? MainEngine : SdmaEngine; // Initiate SDMA copy - gpu->eventBegin(SdmaEngine); - result = trace_.gpa_session_->End(gpu->queue(SdmaEngine).iCmd()); - gpu->eventEnd(SdmaEngine, trace_.end_event_); + gpu->eventBegin(engine); + result = trace_.gpa_session_->End(gpu->queue(engine).iCmd()); + gpu->eventEnd(engine, trace_.end_event_); } // Submit the trace-end command buffer diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp index b81fd90b18..a6136597c0 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp @@ -1332,7 +1332,12 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, } } - gpu.engineID_ = SdmaEngine; + if (dev().settings().disableSdma_) { + // Make sure compute is done before CP DMA start + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); + } else { + gpu.engineID_ = SdmaEngine; + } // Wait for the resources, since runtime may use async transfers wait(gpu, waitOnBusyEngine); @@ -1423,6 +1428,11 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, } } + if (dev().settings().disableSdma_) { + // Make sure CP dma is 
done + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); + } + gpu.eventEnd(gpu.engineID_, event); // Mark source and destination as busy diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp index 45894e3df3..ceda586685 100644 --- a/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp @@ -17,7 +17,7 @@ TimeStamp::TimeStamp(const VirtualGPU& gpu, Pal::IGpuMemory* iMem, uint memOffse TimeStamp::~TimeStamp() {} -void TimeStamp::begin(bool sdma) { +void TimeStamp::begin() { if (!flags_.beginIssued_) { gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_, memOffset_ + CommandStartTime * sizeof(uint64_t)); @@ -25,12 +25,11 @@ void TimeStamp::begin(bool sdma) { } } -void TimeStamp::end(bool sdma) { +void TimeStamp::end() { CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!"); gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_, memOffset_ + CommandEndTime * sizeof(uint64_t)); flags_.endIssued_ = true; - flags_.sdma_ = sdma; } inline void SetValue(uint64_t* time, uint64_t val, double nanos) { @@ -57,7 +56,9 @@ TimeStampCache::~TimeStampCache() { for (uint i = 0; i < tsBuf_.size(); ++i) { tsBuf_[i]->unmap(&gpu_); gpu_.queue(MainEngine).removeMemRef(tsBuf_[i]->iMem()); - gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem()); + if (!gpu_.dev().settings().disableSdma_) { + gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem()); + } delete tsBuf_[i]; } tsBuf_.clear(); @@ -77,7 +78,9 @@ TimeStamp* TimeStampCache::allocTimeStamp() { return nullptr; } gpu_.queue(MainEngine).addMemRef(buf->iMem()); - gpu_.queue(SdmaEngine).addMemRef(buf->iMem()); + if (!gpu_.dev().settings().disableSdma_) { + gpu_.queue(SdmaEngine).addMemRef(buf->iMem()); + } tsBufCpu_ = reinterpret_cast
(buf->map(&gpu_)); memset(tsBufCpu_, 0, TimerBufSize); tsOffset_ = 0; diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp index 9691fa71a2..d88431bcb8 100644 --- a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp +++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp @@ -29,7 +29,6 @@ class TimeStamp : public amd::HeapObject { struct { uint32_t beginIssued_ : 1; uint32_t endIssued_ : 1; - uint32_t sdma_ : 1; }; uint32_t value_; Flags() : value_(0) {} @@ -46,10 +45,10 @@ class TimeStamp : public amd::HeapObject { ~TimeStamp(); //! Starts the timestamp - void begin(bool sdma = false); + void begin(); //! Ends the timestamp - void end(bool sdma = false); + void end(); //! Returns the timestamp result in nano seconds void value(uint64_t* startTime, uint64_t* endTime); diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 4e2408b4e5..5563f260fe 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -838,13 +838,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, if (nullptr == queues_[SdmaEngine]) { return false; } - } else { - queues_[SdmaEngine] = - Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs, - amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers); - if (nullptr == queues_[SdmaEngine]) { - return false; - } } } else { LogError("Runtme couldn't find compute queues!"); @@ -2484,7 +2477,9 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { void VirtualGPU::releaseMemory(GpuMemoryReference* mem) { queues_[MainEngine]->removeCmdMemRef(mem); - queues_[SdmaEngine]->removeCmdMemRef(mem); + if (!dev().settings().disableSdma_) { + queues_[SdmaEngine]->removeCmdMemRef(mem); + } } void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { @@ -3128,9 +3123,9 @@ void 
VirtualGPU::profileEvent(EngineType engine, bool type) const { return; } if (type) { - profileTs_->begin((engine == SdmaEngine) ? true : false); + profileTs_->begin(); } else { - profileTs_->end((engine == SdmaEngine) ? true : false); + profileTs_->end(); } }