From e8c7cf569fa67b6bf66e7e5e71ae4ec11335a911 Mon Sep 17 00:00:00 2001
From: Aryan Salmanpour <aryan.salmanpour@amd.com>
Date: Thu, 29 Oct 2020 17:32:15 -0400
Subject: [PATCH] Add an environment variable for setting a global CU mask

Change-Id: I773b152023c7b8e1e679a42015748f9b23fd946d


[ROCm/clr commit: d03ee6eff6c8d45a90ea7a5565c1223d160a0335]
---
 projects/clr/rocclr/device/device.hpp         |   3 +
 projects/clr/rocclr/device/rocm/rocdevice.cpp | 113 ++++++++++++++++--
 projects/clr/rocclr/device/rocm/rocdevice.hpp |   2 +
 projects/clr/rocclr/utils/flags.hpp           |   5 +-
 4 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 973ad458df..7c125e6531 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -566,6 +566,9 @@ struct Info : public amd::EmbeddedObject {
   uint32_t  hmmSupported_;            //!< ROCr supports HMM interfaces
   uint32_t  hmmCpuMemoryAccessible_;  //!< CPU memory is accessible by GPU without pinning/register
   uint32_t  hmmDirectHostAccess_;     //!< HMM memory is accessible from the host without migration
+
+  //! global CU mask which will be applied to all queues created on this device
+  std::vector<uint32_t> globalCUMask_;
 };
 
 //! Device settings
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 1877dae238..2fb11daf8d 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -544,6 +544,11 @@ bool Device::init() {
       }
     }
 
+    // Check to see if a global CU mask is requested
+    if (amd::IS_HIP && ROC_GLOBAL_CU_MASK[0] != '\0') {
+      roc_device->getGlobalCUMask(ROC_GLOBAL_CU_MASK);
+    }
+
     roc_device.release()->registerDevice();
   }
 
@@ -1501,6 +1506,8 @@ bool Device::populateOCLDeviceConstants() {
   }
 #endif  // AMD_HMM_SUPPORT
 
+  info_.globalCUMask_ = {};
+
   return true;
 }
 
@@ -2421,30 +2428,61 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
       " cooperative: %i", queue, queue_size, queue_priority, coop_queue);
 
   hsa_amd_profiling_set_profiler_enabled(queue, 1);
-  if (cuMask.size() != 0) {
+  if (cuMask.size() != 0 || info_.globalCUMask_.size() != 0) {
     std::stringstream ss;
     ss << std::hex;
-    for (int i = cuMask.size() - 1; i >= 0; i--) {
-      ss << cuMask[i];
+    std::vector<uint32_t> mask = {};
+
+    // handle scenarios where cuMask (custom-defined), globalCUMask_ or both are valid and
+    // fill the final mask which will be applied to the current queue
+    if (cuMask.size() != 0 && info_.globalCUMask_.size() == 0) {
+      mask = cuMask;
+    } else if (cuMask.size() != 0 && info_.globalCUMask_.size() != 0) {
+      for (unsigned int i = 0; i < std::min(cuMask.size(), info_.globalCUMask_.size()); i++) {
+        mask.push_back(cuMask[i] & info_.globalCUMask_[i]);
+      }
+      // check to make sure that, after ANDing cuMask (custom-defined) with the global
+      // CU mask, we have a non-zero mask; otherwise just apply the global CU mask
+      bool zeroCUMask = true;
+      for (auto m : mask) {
+        if (m != 0) {
+          zeroCUMask = false;
+          break;
+        }
+      }
+      if (zeroCUMask) {
+        mask = info_.globalCUMask_;
+      }
+    } else {
+      mask = info_.globalCUMask_;
     }
-    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "setting custom CU mask 0x%s for hardware queue %p",
+
+
+    for (int i = mask.size() - 1; i >= 0; i--) {
+      ss << mask[i];
+    }
+    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "setting CU mask 0x%s for hardware queue %p",
             ss.str().c_str(), queue);
+
     hsa_status_t status = HSA_STATUS_SUCCESS;
-    status = hsa_amd_queue_cu_set_mask(queue, cuMask.size() * 32, cuMask.data());
+    status = hsa_amd_queue_cu_set_mask(queue, mask.size() * 32, mask.data());
     if (status != HSA_STATUS_SUCCESS) {
       DevLogError("Device::acquireQueue: hsa_amd_queue_cu_set_mask failed!");
       hsa_queue_destroy(queue);
       return nullptr;
     }
-    // add queues with custom CU mask into their special pool to keep track
-    // of mapping of these queues to their associated queueInfo (i.e., hostcall buffers)
-    auto result = queueWithCUMaskPool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
-    assert(result.second && "QueueInfo already exists");
-    auto &qInfo = result.first->second;
-    qInfo.refCount = 1;
+    if (cuMask.size() != 0) {
+      // add queues with custom CU mask into their special pool to keep track
+      // of mapping of these queues to their associated queueInfo (i.e., hostcall buffers)
+      auto result = queueWithCUMaskPool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
+      assert(result.second && "QueueInfo already exists");
+      auto& qInfo = result.first->second;
+      qInfo.refCount = 1;
 
-    return queue;
+      return queue;
+    }
   }
+
   if (coop_queue) {
     // Skip queue recycling for cooperative queues, since it should be just one
     // per device.
@@ -2660,5 +2698,56 @@ bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool,
   return true;
 }
 
+//! Parses a hex CU mask string (optional "0x"/"0X" prefix) into info_.globalCUMask_, 32 CU
+//! bits per element with the least significant word first, and lowers maxComputeUnits_ to
+//! the number of CUs left enabled. A malformed string, or a mask that enables no CU or every
+//! physical CU, leaves globalCUMask_ empty and maxComputeUnits_ untouched.
+//! \param cuMaskStr the mask as hex digits; each set bit stands for one CU to be used
+void Device::getGlobalCUMask(std::string cuMaskStr) {
+  // start from "no global mask"; a new mask is published only once fully validated
+  info_.globalCUMask_.clear();
+  if (cuMaskStr.compare(0, 2, "0x") == 0 || cuMaskStr.compare(0, 2, "0X") == 0) {
+    cuMaskStr.erase(0, 2);
+  }
+
+  // physical CU count (doubled in WGP mode) and the number of hex digits covering them,
+  // rounded up so that CU counts which are not a multiple of 4 stay fully addressable
+  const size_t physicalCUs = info_.maxComputeUnits_ * (settings().enableWgpMode_ ? 2 : 1);
+  const size_t compPhysicalCUs = (physicalCUs + 3) / 4;
+
+  std::vector<uint32_t> mask;
+  uint32_t availCUs = 0;            // CUs left enabled by the requested mask
+  size_t end = cuMaskStr.length();  // one past the last digit not consumed yet
+
+  // read the string from its least significant end in chunks of up to 8 digits (one
+  // 32-bit word each); digits beyond the physical CU count are ignored
+  for (size_t i = 0; i < std::min(cuMaskStr.length(), compPhysicalCUs); i += 8) {
+    const size_t numCharToRead = std::min({size_t{8}, compPhysicalCUs - i, end});
+    const std::string temp = cuMaskStr.substr(end - numCharToRead, numCharToRead);
+    end -= numCharToRead;
+    if (temp.find_first_not_of("0123456789abcdefABCDEF") != std::string::npos) {
+      return;  // std::stoul alone would accept signs, blanks and trailing garbage
+    }
+    // at most 8 validated hex digits, so std::stoul cannot throw here
+    unsigned long ul = std::stoul(temp, nullptr, 16);
+    // clear bits addressing CUs beyond the physical ones (only possible in the top word)
+    const size_t bitsLeft = physicalCUs - i * 4;
+    if (bitsLeft < 32) {
+      ul &= (1ul << bitsLeft) - 1;
+    }
+    mask.push_back(static_cast<uint32_t>(ul));
+    // count the set bits, i.e. the CUs this word leaves enabled
+    for (; ul != 0; ul &= (ul - 1)) {
+      availCUs++;
+    }
+  }
+
+  // a mask enabling nothing or everything is no restriction: keep the device as it is
+  if (availCUs != 0 && availCUs < physicalCUs) {
+    info_.maxComputeUnits_ = settings().enableWgpMode_ ? availCUs / 2 : availCUs;
+    info_.globalCUMask_.swap(mask);
+  }
+}
+
 } // namespace roc
 #endif  // WITHOUT_HSA_BACKEND
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 7fc3ebb0f3..39ffcfd84e 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -498,6 +498,8 @@ class Device : public NullDevice {
   //! Initialize memory in AMD HMM on the current device or keeps it in the host memory
   bool SvmAllocInit(void* memory, size_t size) const;
 
+  void getGlobalCUMask(std::string cuMaskStr);
+
  private:
   bool SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryAdvice advice,
                            bool first_alloc = false, bool use_cpu = false) const;
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 53852b2d98..ed9828778d 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -249,7 +249,10 @@ release(bool, HIP_FORCE_QUEUE_PROFILING, false,                               \
 release(uint, PAL_FORCE_ASIC_REVISION, 0,                                     \
         "Force a specific asic revision for all devices")                     \
 release(bool, PAL_EMBED_KERNEL_MD, false,                                     \
-        "Enables writing kernel metadata into command buffers.")
+        "Enables writing kernel metadata into command buffers.")              \
+release(cstring, ROC_GLOBAL_CU_MASK, "",                                      \
+        "Sets a global CU mask (entered as hex value) for all queues. "       \
+        "Each active bit represents using one CU (e.g., 0xf enables only 4 CUs)")
 
 namespace amd {