From 05efcb5556ec3fd1216fedd3084ad4cf07d98edf Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 25 Jul 2019 11:38:30 -0400
Subject: [PATCH] P4 to Git Change 1973670 by gandryey@gera-win10 on 2019/07/25
11:27:11
SWDEV-79445 - OCL generic changes and code clean-up
- Don't create an extra queue for DMA transfers when SDMA is disabled. This should avoid useless sync operations.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#15 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#75 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#143 edit
[ROCm/clr commit: b9b798616f94333ff7b1e7310d7c87b8ba6da4a2]
---
.../clr/rocclr/runtime/device/pal/palconstbuf.cpp | 6 ++++--
.../clr/rocclr/runtime/device/pal/palgpuopen.cpp | 7 ++++---
.../clr/rocclr/runtime/device/pal/palresource.cpp | 12 +++++++++++-
.../rocclr/runtime/device/pal/paltimestamp.cpp | 13 ++++++++-----
.../rocclr/runtime/device/pal/paltimestamp.hpp | 5 ++---
.../clr/rocclr/runtime/device/pal/palvirtual.cpp | 15 +++++----------
6 files changed, 34 insertions(+), 24 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
index 3bf5be1fd0..31cf628ebd 100644
--- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
@@ -69,8 +69,10 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) {
// Get the next buffer in the list
++activeBuffer_;
activeBuffer_ %= MaxNumberOfBuffers;
- // Make sure the buffer isn't busy
- gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]);
+ if (!gpu().dev().settings().disableSdma_) {
+ // Make sure the buffer isn't busy
+ gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]);
+ }
gpu().waitForEvent(&pool_[activeBuffer_].events[MainEngine]);
wrtAddress_ = pool_[activeBuffer_].buf->data();
wrtOffset_ = 0;
diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
index 21af7690df..f548d40693 100644
--- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
@@ -582,10 +582,11 @@ Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) {
// end the session itself
if (result == Pal::Result::Success) {
assert(trace_.gpa_session_ != nullptr);
+ EngineType engine = (gpu->dev().settings().disableSdma_) ? MainEngine : SdmaEngine;
// Initiate SDMA copy
- gpu->eventBegin(SdmaEngine);
- result = trace_.gpa_session_->End(gpu->queue(SdmaEngine).iCmd());
- gpu->eventEnd(SdmaEngine, trace_.end_event_);
+ gpu->eventBegin(engine);
+ result = trace_.gpa_session_->End(gpu->queue(engine).iCmd());
+ gpu->eventEnd(engine, trace_.end_event_);
}
// Submit the trace-end command buffer
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
index b81fd90b18..a6136597c0 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
@@ -1332,7 +1332,12 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
}
}
- gpu.engineID_ = SdmaEngine;
+ if (dev().settings().disableSdma_) {
+ // Make sure compute is done before CP DMA start
+ gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
+ } else {
+ gpu.engineID_ = SdmaEngine;
+ }
// Wait for the resources, since runtime may use async transfers
wait(gpu, waitOnBusyEngine);
@@ -1423,6 +1428,11 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
}
}
+ if (dev().settings().disableSdma_) {
+ // Make sure CP dma is done
+ gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
+ }
+
gpu.eventEnd(gpu.engineID_, event);
// Mark source and destination as busy
diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp
index 45894e3df3..ceda586685 100644
--- a/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp
@@ -17,7 +17,7 @@ TimeStamp::TimeStamp(const VirtualGPU& gpu, Pal::IGpuMemory* iMem, uint memOffse
TimeStamp::~TimeStamp() {}
-void TimeStamp::begin(bool sdma) {
+void TimeStamp::begin() {
if (!flags_.beginIssued_) {
gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_,
memOffset_ + CommandStartTime * sizeof(uint64_t));
@@ -25,12 +25,11 @@ void TimeStamp::begin(bool sdma) {
}
}
-void TimeStamp::end(bool sdma) {
+void TimeStamp::end() {
CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!");
gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_,
memOffset_ + CommandEndTime * sizeof(uint64_t));
flags_.endIssued_ = true;
- flags_.sdma_ = sdma;
}
inline void SetValue(uint64_t* time, uint64_t val, double nanos) {
@@ -57,7 +56,9 @@ TimeStampCache::~TimeStampCache() {
for (uint i = 0; i < tsBuf_.size(); ++i) {
tsBuf_[i]->unmap(&gpu_);
gpu_.queue(MainEngine).removeMemRef(tsBuf_[i]->iMem());
- gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem());
+ if (!gpu_.dev().settings().disableSdma_) {
+ gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem());
+ }
delete tsBuf_[i];
}
tsBuf_.clear();
@@ -77,7 +78,9 @@ TimeStamp* TimeStampCache::allocTimeStamp() {
return nullptr;
}
gpu_.queue(MainEngine).addMemRef(buf->iMem());
- gpu_.queue(SdmaEngine).addMemRef(buf->iMem());
+ if (!gpu_.dev().settings().disableSdma_) {
+ gpu_.queue(SdmaEngine).addMemRef(buf->iMem());
+ }
tsBufCpu_ = reinterpret_cast(buf->map(&gpu_));
memset(tsBufCpu_, 0, TimerBufSize);
tsOffset_ = 0;
diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp
index 9691fa71a2..d88431bcb8 100644
--- a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp
@@ -29,7 +29,6 @@ class TimeStamp : public amd::HeapObject {
struct {
uint32_t beginIssued_ : 1;
uint32_t endIssued_ : 1;
- uint32_t sdma_ : 1;
};
uint32_t value_;
Flags() : value_(0) {}
@@ -46,10 +45,10 @@ class TimeStamp : public amd::HeapObject {
~TimeStamp();
//! Starts the timestamp
- void begin(bool sdma = false);
+ void begin();
//! Ends the timestamp
- void end(bool sdma = false);
+ void end();
//! Returns the timestamp result in nano seconds
void value(uint64_t* startTime, uint64_t* endTime);
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 4e2408b4e5..5563f260fe 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -838,13 +838,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
if (nullptr == queues_[SdmaEngine]) {
return false;
}
- } else {
- queues_[SdmaEngine] =
- Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
- amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
- if (nullptr == queues_[SdmaEngine]) {
- return false;
- }
}
} else {
LogError("Runtme couldn't find compute queues!");
@@ -2484,7 +2477,9 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
queues_[MainEngine]->removeCmdMemRef(mem);
- queues_[SdmaEngine]->removeCmdMemRef(mem);
+ if (!dev().settings().disableSdma_) {
+ queues_[SdmaEngine]->removeCmdMemRef(mem);
+ }
}
void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) {
@@ -3128,9 +3123,9 @@ void VirtualGPU::profileEvent(EngineType engine, bool type) const {
return;
}
if (type) {
- profileTs_->begin((engine == SdmaEngine) ? true : false);
+ profileTs_->begin();
} else {
- profileTs_->end((engine == SdmaEngine) ? true : false);
+ profileTs_->end();
}
}