P4 to Git Change 1973670 by gandryey@gera-win10 on 2019/07/25 11:27:11

SWDEV-79445 - OCL generic changes and code clean-up - Don't create an extra queue for DMA transfers when SDMA is disabled. That should allow the runtime to avoid useless sync operations. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#15 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#75 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.cpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#143 edit [ROCm/clr commit: b9b798616f]
2019-07-25 11:38:30 -04:00
parent 81b9a53440
commit 05efcb5556
6 changed files with 34 additions and 24 deletions
@@ -69,8 +69,10 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) {
    // Get the next buffer in the list
    ++activeBuffer_;
    activeBuffer_ %= MaxNumberOfBuffers;
-    // Make sure the buffer isn't busy
-    gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]);
+    if (!gpu().dev().settings().disableSdma_) {
+      // Make sure the buffer isn't busy
+      gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]);
+    }
    gpu().waitForEvent(&pool_[activeBuffer_].events[MainEngine]);
    wrtAddress_ = pool_[activeBuffer_].buf->data();
    wrtOffset_ = 0;
@@ -582,10 +582,11 @@ Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) {
  // end the session itself
  if (result == Pal::Result::Success) {
    assert(trace_.gpa_session_ != nullptr);
+    EngineType engine = (gpu->dev().settings().disableSdma_) ? MainEngine : SdmaEngine;
    // Initiate SDMA copy
-    gpu->eventBegin(SdmaEngine);
-    result = trace_.gpa_session_->End(gpu->queue(SdmaEngine).iCmd());
-    gpu->eventEnd(SdmaEngine, trace_.end_event_);
+    gpu->eventBegin(engine);
+    result = trace_.gpa_session_->End(gpu->queue(engine).iCmd());
+    gpu->eventEnd(engine, trace_.end_event_);
  }

  // Submit the trace-end command buffer
@@ -1332,7 +1332,12 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
    }
  }

-  gpu.engineID_ = SdmaEngine;
+  if (dev().settings().disableSdma_) {
+    // Make sure compute is done before CP DMA start
+    gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
+  } else {
+    gpu.engineID_ = SdmaEngine;
+  }

  // Wait for the resources, since runtime may use async transfers
  wait(gpu, waitOnBusyEngine);
@@ -1423,6 +1428,11 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
    }
  }

+  if (dev().settings().disableSdma_) {
+    // Make sure CP dma is done
+    gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
+  }
+
  gpu.eventEnd(gpu.engineID_, event);

  // Mark source and destination as busy
@@ -17,7 +17,7 @@ TimeStamp::TimeStamp(const VirtualGPU& gpu, Pal::IGpuMemory* iMem, uint memOffse

 TimeStamp::~TimeStamp() {}

-void TimeStamp::begin(bool sdma) {
+void TimeStamp::begin() {
  if (!flags_.beginIssued_) {
    gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_,
                                    memOffset_ + CommandStartTime * sizeof(uint64_t));
@@ -25,12 +25,11 @@ void TimeStamp::begin(bool sdma) {
  }
 }

-void TimeStamp::end(bool sdma) {
+void TimeStamp::end() {
  CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!");
  gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_,
                                  memOffset_ + CommandEndTime * sizeof(uint64_t));
  flags_.endIssued_ = true;
-  flags_.sdma_ = sdma;
 }

 inline void SetValue(uint64_t* time, uint64_t val, double nanos) {
@@ -57,7 +56,9 @@ TimeStampCache::~TimeStampCache() {
  for (uint i = 0; i < tsBuf_.size(); ++i) {
    tsBuf_[i]->unmap(&gpu_);
    gpu_.queue(MainEngine).removeMemRef(tsBuf_[i]->iMem());
-    gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem());
+    if (!gpu_.dev().settings().disableSdma_) {
+      gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem());
+    }
    delete tsBuf_[i];
  }
  tsBuf_.clear();
@@ -77,7 +78,9 @@ TimeStamp* TimeStampCache::allocTimeStamp() {
        return nullptr;
      }
      gpu_.queue(MainEngine).addMemRef(buf->iMem());
-      gpu_.queue(SdmaEngine).addMemRef(buf->iMem());
+      if (!gpu_.dev().settings().disableSdma_) {
+        gpu_.queue(SdmaEngine).addMemRef(buf->iMem());
+      }
      tsBufCpu_ = reinterpret_cast<address>(buf->map(&gpu_));
      memset(tsBufCpu_, 0, TimerBufSize);
      tsOffset_ = 0;
@@ -29,7 +29,6 @@ class TimeStamp : public amd::HeapObject {
    struct {
      uint32_t beginIssued_ : 1;
      uint32_t endIssued_ : 1;
-      uint32_t sdma_ : 1;
    };
    uint32_t value_;
    Flags() : value_(0) {}
@@ -46,10 +45,10 @@ class TimeStamp : public amd::HeapObject {
  ~TimeStamp();

  //! Starts the timestamp
-  void begin(bool sdma = false);
+  void begin();

  //! Ends the timestamp
-  void end(bool sdma = false);
+  void end();

  //! Returns the timestamp result in nano seconds
  void value(uint64_t* startTime, uint64_t* endTime);
@@ -838,13 +838,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
      if (nullptr == queues_[SdmaEngine]) {
        return false;
      }
-    } else {
-      queues_[SdmaEngine] =
-          Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
-                        amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
-      if (nullptr == queues_[SdmaEngine]) {
-        return false;
-      }
    }
  } else {
    LogError("Runtme couldn't find compute queues!");
@@ -2484,7 +2477,9 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {

 void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
  queues_[MainEngine]->removeCmdMemRef(mem);
-  queues_[SdmaEngine]->removeCmdMemRef(mem);
+  if (!dev().settings().disableSdma_) {
+    queues_[SdmaEngine]->removeCmdMemRef(mem);
+  }
 }

 void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) {
@@ -3128,9 +3123,9 @@ void VirtualGPU::profileEvent(EngineType engine, bool type) const {
    return;
  }
  if (type) {
-    profileTs_->begin((engine == SdmaEngine) ? true : false);
+    profileTs_->begin();
  } else {
-    profileTs_->end((engine == SdmaEngine) ? true : false);
+    profileTs_->end();
  }
 }