From e8c7cf569fa67b6bf66e7e5e71ae4ec11335a911 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Thu, 29 Oct 2020 17:32:15 -0400 Subject: [PATCH] Add an environment variable for setting a global CU mask Change-Id: I773b152023c7b8e1e679a42015748f9b23fd946d [ROCm/clr commit: d03ee6eff6c8d45a90ea7a5565c1223d160a0335] --- projects/clr/rocclr/device/device.hpp | 3 + projects/clr/rocclr/device/rocm/rocdevice.cpp | 113 ++++++++++++++++-- projects/clr/rocclr/device/rocm/rocdevice.hpp | 2 + projects/clr/rocclr/utils/flags.hpp | 5 +- 4 files changed, 110 insertions(+), 13 deletions(-) diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 973ad458df..7c125e6531 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -566,6 +566,9 @@ struct Info : public amd::EmbeddedObject { uint32_t hmmSupported_; //!< ROCr supports HMM interfaces uint32_t hmmCpuMemoryAccessible_; //!< CPU memory is accessible by GPU without pinning/register uint32_t hmmDirectHostAccess_; //!< HMM memory is accessible from the host without migration + + //! global CU mask which will be applied to all queues created on this device + std::vector globalCUMask_; }; //! 
Device settings diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 1877dae238..2fb11daf8d 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -544,6 +544,11 @@ bool Device::init() { } } + // Check to see if a global CU mask is requested + if (amd::IS_HIP && ROC_GLOBAL_CU_MASK[0] != '\0') { + roc_device->getGlobalCUMask(ROC_GLOBAL_CU_MASK); + } + roc_device.release()->registerDevice(); } @@ -1501,6 +1506,8 @@ bool Device::populateOCLDeviceConstants() { } #endif // AMD_HMM_SUPPORT + info_.globalCUMask_ = {}; + return true; } @@ -2421,30 +2428,61 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue, " cooperative: %i", queue, queue_size, queue_priority, coop_queue); hsa_amd_profiling_set_profiler_enabled(queue, 1); - if (cuMask.size() != 0) { + if (cuMask.size() != 0 || info_.globalCUMask_.size() != 0) { std::stringstream ss; ss << std::hex; - for (int i = cuMask.size() - 1; i >= 0; i--) { - ss << cuMask[i]; + std::vector mask = {}; + + // handle scenarios where cuMask (custom-defined), globalCUMask_ or both are valid and + // fill the final mask which will be appiled to the current queue + if (cuMask.size() != 0 && info_.globalCUMask_.size() == 0) { + mask = cuMask; + } else if (cuMask.size() != 0 && info_.globalCUMask_.size() != 0) { + for (unsigned int i = 0; i < std::min(cuMask.size(), info_.globalCUMask_.size()); i++) { + mask.push_back(cuMask[i] & info_.globalCUMask_[i]); + } + // check to make sure after ANDing cuMask (custom-defined) with global + //CU mask, we have non-zero mask, oterwise just apply global CU mask + bool zeroCUMask = true; + for (auto m : mask) { + if (m != 0) { + zeroCUMask = false; + break; + } + } + if (zeroCUMask) { + mask = info_.globalCUMask_; + } + } else { + mask = info_.globalCUMask_; } - ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "setting custom CU mask 0x%s for hardware queue %p", + + + 
for (int i = mask.size() - 1; i >= 0; i--) { + ss << mask[i]; + } + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "setting CU mask 0x%s for hardware queue %p", ss.str().c_str(), queue); + hsa_status_t status = HSA_STATUS_SUCCESS; - status = hsa_amd_queue_cu_set_mask(queue, cuMask.size() * 32, cuMask.data()); + status = hsa_amd_queue_cu_set_mask(queue, mask.size() * 32, mask.data()); if (status != HSA_STATUS_SUCCESS) { DevLogError("Device::acquireQueue: hsa_amd_queue_cu_set_mask failed!"); hsa_queue_destroy(queue); return nullptr; } - // add queues with custom CU mask into their special pool to keep track - // of mapping of these queues to their associated queueInfo (i.e., hostcall buffers) - auto result = queueWithCUMaskPool_[qIndex].emplace(std::make_pair(queue, QueueInfo())); - assert(result.second && "QueueInfo already exists"); - auto &qInfo = result.first->second; - qInfo.refCount = 1; + if (cuMask.size() != 0) { + // add queues with custom CU mask into their special pool to keep track + // of mapping of these queues to their associated queueInfo (i.e., hostcall buffers) + auto result = queueWithCUMaskPool_[qIndex].emplace(std::make_pair(queue, QueueInfo())); + assert(result.second && "QueueInfo already exists"); + auto& qInfo = result.first->second; + qInfo.refCount = 1; - return queue; + return queue; + } } + if (coop_queue) { // Skip queue recycling for cooperative queues, since it should be just one // per device. @@ -2660,5 +2698,65 @@ bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool,
   return true;
 }
 
+//! Parse the hex string given through ROC_GLOBAL_CU_MASK into info_.globalCUMask_
+//! (8 hex digits per 32-bit word, least significant word first) and lower
+//! info_.maxComputeUnits_ to the number of CUs the mask leaves enabled.
+//! The mask ends up empty, i.e. disabled, when the string is empty or not plain
+//! hex, enables no CU at all, or does not restrict the device.
+void Device::getGlobalCUMask(std::string cuMaskStr) {
+  // every early exit below has to leave the global mask disabled
+  info_.globalCUMask_ = {};
+
+  std::string pre = cuMaskStr.substr(0, 2);
+  if (pre.compare("0x") == 0 || pre.compare("0X") == 0) {
+    cuMaskStr = cuMaskStr.substr(2, cuMaskStr.length());
+  }
+
+  // std::stoul() skips whitespace and signs and stops at the first bad character
+  // without failing, so reject anything that is not a plain hex number up front
+  if (cuMaskStr.length() == 0 ||
+      cuMaskStr.find_first_not_of("0123456789abcdefABCDEF") != std::string::npos) {
+    return;
+  }
+
+  int end = cuMaskStr.length();
+
+  // the number of current physical CUs compressed in 4-bits
+  // NOTE(review): the integer division drops up to 3 CUs when the CU count is not
+  // a multiple of 4 - confirm this is intended
+  size_t compPhysicalCUs = static_cast<size_t>((settings().enableWgpMode_ ?
+    info_.maxComputeUnits_ * 2 : info_.maxComputeUnits_) / 4);
+
+  // the number of final available compute units after applying the requested CU mask
+  uint32_t availCUs = 0;
+
+  // read numCharToRead characters (8 or less) from the end of the cuMask string each
+  // time, convert them from hex, and store the word into globalCUMask_. If the cuMask
+  // string is longer than the compressed physical available CUs, ignore the rest
+  for (unsigned i = 0; i < std::min(cuMaskStr.length(), compPhysicalCUs); i += 8) {
+    // a trailing partial word only holds the digits left past i; the chunk is 1 to 8
+    // validated hex digits, so std::stoul() cannot throw here
+    int numCharToRead = (i + 8 <= compPhysicalCUs) ? 8 : static_cast<int>(compPhysicalCUs - i);
+    std::string temp = cuMaskStr.substr(std::max(0, end - numCharToRead),
+                                        std::min(numCharToRead, end));
+    end -= numCharToRead;
+    unsigned long ul = std::stoul(temp, 0, 16);
+    info_.globalCUMask_.push_back(static_cast<uint32_t>(ul));
+    // count number of set bits in ul to find the number of active CUs
+    // in each iteration
+    while (ul) {
+      ul &= (ul - 1);
+      availCUs++;
+    }
+  }
+
+  // update the maxComputeUnits_ based on the requested CU mask
+  if (availCUs != 0 && availCUs < compPhysicalCUs * 4) {
+    info_.maxComputeUnits_ = settings().enableWgpMode_ ? availCUs / 2 : availCUs;
+  } else {
+    info_.globalCUMask_ = {};
+  }
+}
+
 } // namespace roc
 #endif // WITHOUT_HSA_BACKEND
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 7fc3ebb0f3..39ffcfd84e 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -498,6 +498,8 @@ class Device : public NullDevice { //! 
Initialize memory in AMD HMM on the current device or keeps it in the host memory bool SvmAllocInit(void* memory, size_t size) const; + void getGlobalCUMask(std::string cuMaskStr); + private: bool SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryAdvice advice, bool first_alloc = false, bool use_cpu = false) const; diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 53852b2d98..ed9828778d 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -249,7 +249,10 @@ release(bool, HIP_FORCE_QUEUE_PROFILING, false, \ release(uint, PAL_FORCE_ASIC_REVISION, 0, \ "Force a specific asic revision for all devices") \ release(bool, PAL_EMBED_KERNEL_MD, false, \ - "Enables writing kernel metadata into command buffers.") + "Enables writing kernel metadata into command buffers.") \ +release(cstring, ROC_GLOBAL_CU_MASK, "", \ + "Sets a global CU mask (entered as hex value) for all queues," \ + "Each active bit represents using one CU (e.g., 0xf enables only 4 CUs)") namespace amd {