From 65de22f0ed74b6b507eb96f908505ad84ef3b58f Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Fri, 13 Oct 2017 14:10:40 -0400
Subject: [PATCH] P4 to Git Change 1469850 by gandryey@gera-w8 on 2017/10/13
 13:56:50

	SWDEV-79445 - OCL generic changes and code clean-up
	- Remove obsolete/unused code

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#62 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#34 edit


[ROCm/clr commit: 59ed7d24454a746175f8e59e0f45fc7aad150b71]
---
 .../runtime/device/pal/paldebugmanager.cpp    |   2 +-
 .../rocclr/runtime/device/pal/palvirtual.cpp  | 106 +++---------------
 .../rocclr/runtime/device/pal/palvirtual.hpp  |  37 +-----
 3 files changed, 22 insertions(+), 123 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
index 7ed056645f..124de40991 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
@@ -143,7 +143,7 @@ void GpuDebugManager::unregisterDebugger() {
 
 void GpuDebugManager::flushCache(uint32_t mask) {
   HwDbgGpuCacheMask cacheMask(mask);
-  device()->xferQueue()->flushCuCaches(cacheMask);
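+  //device()->xferQueue()->flushCuCaches(cacheMask);  // NOTE: call disabled because flushCuCaches() is removed from VirtualGPU in this change; flushCache() is currently a no-op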
 }
 
 
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 3eb5001971..bbf53a634a 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -707,13 +707,13 @@ VirtualGPU::VirtualGPU(Device& device)
       engineID_(MainEngine),
       gpuDevice_(static_cast<Device&>(device)),
       execution_("Virtual GPU execution lock", true),
-      printfDbg_(nullptr),
       printfDbgHSA_(nullptr),
       tsCache_(nullptr),
       dmaFlushMgmt_(device),
       hwRing_(0),
       readjustTimeGPU_(0),
-      currTs_(nullptr),
+      lastTS_(nullptr),
+      profileTs_(nullptr),
       vqHeader_(nullptr),
       virtualQueue_(nullptr),
       schedParams_(nullptr),
@@ -722,10 +722,6 @@ VirtualGPU::VirtualGPU(Device& device)
       maskGroups_(1),
       hsaQueueMem_(nullptr),
       cmdAllocator_(nullptr) {
-  memset(&cal_, 0, sizeof(CalVirtualDesc));
-  for (uint i = 0; i < AllEngines; ++i) {
-    cal_.events_[i].invalidate();
-  }
 
   // Note: Virtual GPU device creation must be a thread safe operation
   index_ = gpuDevice_.numOfVgpus_++;
@@ -829,14 +825,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
     return false;
   }
 
-  // Create Printf class
-  printfDbg_ = new PrintfDbg(gpuDevice_);
-  if ((nullptr == printfDbg_) || !printfDbg_->create()) {
-    delete printfDbg_;
-    LogError("Could not allocate debug buffer for printf()!");
-    return false;
-  }
-
   // Create HSAILPrintf class
   printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_);
   if (nullptr == printfDbgHSA_) {
@@ -930,9 +918,6 @@ VirtualGPU::~VirtualGPU() {
     freeCbQueue_.pop();
   }
 
-  // Destroy printf object
-  delete printfDbg_;
-
   // Destroy printfHSA object
   delete printfDbgHSA_;
 
@@ -1833,53 +1818,6 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
   profilingEnd(vcmd);
 }
 
-void VirtualGPU::findIterations(const amd::NDRangeContainer& sizes, const amd::NDRange& local,
-                                amd::NDRange& groups, amd::NDRange& remainder, size_t& extra) {
-  size_t dimensions = sizes.dimensions();
-
-  if (cal()->iterations_ > 1) {
-    size_t iterations = cal()->iterations_;
-    cal_.iterations_ = 1;
-
-    // Find the total amount of all groups
-    groups = sizes.global() / local;
-    if (dev().settings().partialDispatch_) {
-      for (uint j = 0; j < dimensions; ++j) {
-        if ((sizes.global()[j] % local[j]) != 0) {
-          groups[j]++;
-        }
-      }
-    }
-
-    // Calculate the real number of required iterations and
-    // the workgroup size of each iteration
-    for (int j = (dimensions - 1); j >= 0; --j) {
-      // Find possible size of each iteration
-      size_t tmp = (groups[j] / iterations);
-      // Make sure the group size is more than 1
-      if (tmp > 0) {
-        remainder = groups;
-        remainder[j] = (groups[j] % tmp);
-
-        extra = ((groups[j] / tmp) +
-                 // Check for the remainder
-                 ((remainder[j] != 0) ? 1 : 0));
-        // Recalculate the number of iterations
-        cal_.iterations_ *= extra;
-        if (remainder[j] == 0) {
-          extra = 0;
-        }
-        groups[j] = tmp;
-        break;
-      } else {
-        iterations = ((iterations / groups[j]) + (((iterations % groups[j]) != 0) ? 1 : 0));
-        cal_.iterations_ *= groups[j];
-        groups[j] = 1;
-      }
-    }
-  }
-}
-
 void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
@@ -2651,7 +2589,7 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
   bool gpuCommand = false;
 
   for (uint i = 0; i < AllEngines; ++i) {
-    if (cal_.events_[i].isValid()) {
+    if (events_[i].isValid()) {
       gpuCommand = true;
     }
   }
@@ -2668,10 +2606,10 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
     }
 
     if (nullptr == cb) {
-      cb = new CommandBatch(list, cal()->events_, cal()->lastTS_);
+      cb = new CommandBatch(list, events_, lastTS_);
     } else {
       freeCbQueue_.pop();
-      cb->init(list, cal()->events_, cal()->lastTS_);
+      cb->init(list, events_, lastTS_);
     }
   }
 
@@ -2684,12 +2622,12 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
       // if runtime didn't submit any commands
       //! @note: it's safe to invalidate events, since
       //! we already saved them with the batch creation step above
-      cal_.events_[i].invalidate();
+      events_[i].invalidate();
     }
   }
 
   // Mark last TS as nullptr, so runtime won't process empty batches with the old TS
-  cal_.lastTS_ = nullptr;
+  lastTS_ = nullptr;
   if (nullptr != cb) {
     cbQueue_.push(cb);
   }
@@ -2721,7 +2659,7 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
 void VirtualGPU::enableSyncedBlit() const { return blitMgr_->enableSynchronization(); }
 
 void VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush) {
-  cal_.events_[engineID_] = gpuEvent;
+  events_[engineID_] = gpuEvent;
 
   // Flush current DMA buffer if requested
   if (flush) {
@@ -2738,7 +2676,7 @@ void VirtualGPU::flushDMA(uint engineID) {
     //! but L1 still has to be invalidated.
   }
 
-  isDone(&cal_.events_[engineID]);
+  isDone(&events_[engineID]);
 }
 
 bool VirtualGPU::waitAllEngines(CommandBatch* cb) {
@@ -2747,7 +2685,7 @@ bool VirtualGPU::waitAllEngines(CommandBatch* cb) {
 
   // If command batch is nullptr then wait for the current
   if (nullptr == cb) {
-    events = cal_.events_;
+    events = events_;
   } else {
     events = cb->events_;
   }
@@ -2844,7 +2782,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
     }
     // Save the TimeStamp object in the current OCL event
     command.setData(ts);
-    currTs_ = ts;
+    profileTs_ = ts;
     state_.profileEnabled_ = true;
   }
 }
@@ -2855,7 +2793,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
   if (ts != nullptr) {
     // Check if the command actually did any GPU submission
     if (ts->isValid()) {
-      cal_.lastTS_ = ts;
+      lastTS_ = ts;
     } else {
       // Destroy the TimeStamp object
       tsCache_->freeTimeStamp(ts);
@@ -2949,13 +2887,13 @@ void VirtualGPU::addDoppRef(const Memory* memory, bool lastDoppCmd, bool pfpaDop
 }
 
 void VirtualGPU::profileEvent(EngineType engine, bool type) const {
-  if (nullptr == currTs_) {
+  if (nullptr == profileTs_) {
     return;
   }
   if (type) {
-    currTs_->begin((engine == SdmaEngine) ? true : false);
+    profileTs_->begin((engine == SdmaEngine) ? true : false);
   } else {
-    currTs_->end((engine == SdmaEngine) ? true : false);
+    profileTs_->end((engine == SdmaEngine) ? true : false);
   }
 }
 
@@ -3105,20 +3043,6 @@ void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) {
   virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
 }
 
-void VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) {
-  Unimplemented();
-  /*
-      //! @todo:  fix issue of no event available for the flush/invalidate cache command
-      InvalidateSqCaches(cache_mask.sqICache_,
-                         cache_mask.sqKCache_,
-                         cache_mask.tcL1_,
-                         cache_mask.tcL2_);
-  */
-  flushDMA(engineID_);
-
-  return;
-}
-
 void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt,
                                  HwDbgKernelInfo& kernelInfo, amd::Event* enqueueEvent) {
   amd::HwDebugManager* dbgManager = dev().hwDebugMgr();
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index 6b0d2dcd54..2318dcd729 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -205,13 +205,6 @@ class VirtualGPU : public device::VirtualDevice {
     State() : value_(0) {}
   };
 
-  //! CAL descriptor for the GPU virtual device
-  struct CalVirtualDesc : public amd::EmbeddedObject {
-    GpuEvent events_[AllEngines];  //!< Last known GPU events
-    uint iterations_;              //!< Number of iterations for the execution
-    TimeStamp* lastTS_;            //!< Last timestamp executed on Virtual GPU
-  };
-
   typedef std::vector<ConstBuffer*> constbufs_t;
 
   class MemoryDependency : public amd::EmbeddedObject {
@@ -327,9 +320,6 @@ class VirtualGPU : public device::VirtualDevice {
   //! Returns GPU device object associated with this kernel
   const Device& dev() const { return gpuDevice_; }
 
-  //! Returns CAL descriptor of the virtual device
-  const CalVirtualDesc* cal() const { return &cal_; }
-
   //! Set the last known GPU event
   void setGpuEvent(GpuEvent gpuEvent,  //!< GPU event for tracking
                    bool flush = false  //!< TRUE if flush is required
@@ -401,9 +391,6 @@ class VirtualGPU : public device::VirtualDevice {
   //! Returns the virtual gpu unique index
   uint index() const { return index_; }
 
-  //! Get the PrintfDbg object
-  PrintfDbg& printfDbg() const { return *printfDbg_; }
-
   //! Get the PrintfDbgHSA object
   PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; }
 
@@ -425,9 +412,6 @@ class VirtualGPU : public device::VirtualDevice {
   //! Returns the HW ring used on this virtual device
   uint hwRing() const { return hwRing_; }
 
-  //! Returns current timestamp object for profiling
-  TimeStamp* currTs() const { return cal_.lastTS_; }
-
   //! Returns virtual queue object for device enqueuing
   Memory* vQueue() const { return virtualQueue_; }
 
@@ -439,10 +423,6 @@ class VirtualGPU : public device::VirtualDevice {
                           );
 
   EngineType engineID_;  //!< Engine ID for this VirtualGPU
-  State state_;          //!< virtual GPU current state
-  CalVirtualDesc cal_;   //!< CAL virtual device descriptor
-
-  void flushCuCaches(HwDbgGpuCacheMask cache_mask);  //!< flush/invalidate SQ cache
 
   //! Returns PAL command buffer interface
   Pal::ICmdBuffer* iCmd() const {
@@ -530,14 +510,6 @@ class VirtualGPU : public device::VirtualDevice {
     MemoryRange() : start_(0), end_(0) {}
   };
 
-  //! Finds total amount of necessary iterations
-  inline void findIterations(const amd::NDRangeContainer& sizes,  //!< Original workload sizes
-                             const amd::NDRange& local,           //!< Local workgroup size
-                             amd::NDRange& groups,                //!< Calculated workgroup sizes
-                             amd::NDRange& remainder,             //!< Calculated remainder sizes
-                             size_t& extra  //!< Amount of extra executions for remainder
-                             );
-
   //! Allocates constant buffers
   bool allocConstantBuffers();
 
@@ -592,7 +564,6 @@ class VirtualGPU : public device::VirtualDevice {
   amd::Monitor execution_;  //!< Lock to serialise access to all device objects
   uint index_;              //!< The virtual device unique index
 
-  PrintfDbg* printfDbg_;        //!< GPU printf implemenation
   PrintfDbgHSA* printfDbgHSA_;  //!< HSAIL printf implemenation
 
   TimeStampCache* tsCache_;            //!< TimeStamp cache
@@ -609,8 +580,12 @@ class VirtualGPU : public device::VirtualDevice {
 
   uint hwRing_;  //!< HW ring used on this virtual device
 
-  uint64_t readjustTimeGPU_;  //!< Readjust time between GPU and CPU timestamps
-  TimeStamp* currTs_;         //!< current timestamp for command
+  State state_;          //!< virtual GPU current state
+  GpuEvent events_[AllEngines];  //!< Last known GPU events
+
+  uint64_t readjustTimeGPU_;   //!< Readjust time between GPU and CPU timestamps
+  TimeStamp* lastTS_;          //!< Last timestamp executed on Virtual GPU
+  TimeStamp* profileTs_;       //!< current profiling timestamp for command
 
   AmdVQueueHeader* vqHeader_;  //!< Sysmem copy for virtual queue header
   Memory* virtualQueue_;       //!< Virtual device queue