Add an environment variable for setting a global CU mask

Change-Id: I773b152023c7b8e1e679a42015748f9b23fd946d


[ROCm/clr commit: d03ee6eff6]
此提交包含在:
Aryan Salmanpour
2020-10-29 17:32:15 -04:00
提交者 Aryan Salmanpour
父節點 a1bf89f8d7
當前提交 e8c7cf569f
共有 4 個檔案被更改,包括 110 行新增13 行删除
+3
查看文件
@@ -566,6 +566,9 @@ struct Info : public amd::EmbeddedObject {
uint32_t hmmSupported_; //!< ROCr supports HMM interfaces
uint32_t hmmCpuMemoryAccessible_; //!< CPU memory is accessible by GPU without pinning/register
uint32_t hmmDirectHostAccess_; //!< HMM memory is accessible from the host without migration
//! global CU mask which will be applied to all queues created on this device
std::vector<uint32_t> globalCUMask_;
};
//! Device settings
+101 -12
查看文件
@@ -544,6 +544,11 @@ bool Device::init() {
}
}
// Check to see if a global CU mask is requested
if (amd::IS_HIP && ROC_GLOBAL_CU_MASK[0] != '\0') {
roc_device->getGlobalCUMask(ROC_GLOBAL_CU_MASK);
}
roc_device.release()->registerDevice();
}
@@ -1501,6 +1506,8 @@ bool Device::populateOCLDeviceConstants() {
}
#endif // AMD_HMM_SUPPORT
info_.globalCUMask_ = {};
return true;
}
@@ -2421,30 +2428,61 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
" cooperative: %i", queue, queue_size, queue_priority, coop_queue);
hsa_amd_profiling_set_profiler_enabled(queue, 1);
if (cuMask.size() != 0) {
if (cuMask.size() != 0 || info_.globalCUMask_.size() != 0) {
std::stringstream ss;
ss << std::hex;
for (int i = cuMask.size() - 1; i >= 0; i--) {
ss << cuMask[i];
std::vector<uint32_t> mask = {};
// handle scenarios where cuMask (custom-defined), globalCUMask_ or both are valid and
// fill the final mask which will be applied to the current queue
if (cuMask.size() != 0 && info_.globalCUMask_.size() == 0) {
mask = cuMask;
} else if (cuMask.size() != 0 && info_.globalCUMask_.size() != 0) {
for (unsigned int i = 0; i < std::min(cuMask.size(), info_.globalCUMask_.size()); i++) {
mask.push_back(cuMask[i] & info_.globalCUMask_[i]);
}
// check to make sure that, after ANDing cuMask (custom-defined) with the global
// CU mask, we have a non-zero mask; otherwise just apply the global CU mask
bool zeroCUMask = true;
for (auto m : mask) {
if (m != 0) {
zeroCUMask = false;
break;
}
}
if (zeroCUMask) {
mask = info_.globalCUMask_;
}
} else {
mask = info_.globalCUMask_;
}
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "setting custom CU mask 0x%s for hardware queue %p",
for (int i = mask.size() - 1; i >= 0; i--) {
ss << mask[i];
}
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "setting CU mask 0x%s for hardware queue %p",
ss.str().c_str(), queue);
hsa_status_t status = HSA_STATUS_SUCCESS;
status = hsa_amd_queue_cu_set_mask(queue, cuMask.size() * 32, cuMask.data());
status = hsa_amd_queue_cu_set_mask(queue, mask.size() * 32, mask.data());
if (status != HSA_STATUS_SUCCESS) {
DevLogError("Device::acquireQueue: hsa_amd_queue_cu_set_mask failed!");
hsa_queue_destroy(queue);
return nullptr;
}
// add queues with custom CU mask into their special pool to keep track
// of mapping of these queues to their associated queueInfo (i.e., hostcall buffers)
auto result = queueWithCUMaskPool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
assert(result.second && "QueueInfo already exists");
auto &qInfo = result.first->second;
qInfo.refCount = 1;
if (cuMask.size() != 0) {
// add queues with custom CU mask into their special pool to keep track
// of mapping of these queues to their associated queueInfo (i.e., hostcall buffers)
auto result = queueWithCUMaskPool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
assert(result.second && "QueueInfo already exists");
auto& qInfo = result.first->second;
qInfo.refCount = 1;
return queue;
return queue;
}
}
if (coop_queue) {
// Skip queue recycling for cooperative queues, since it should be just one
// per device.
@@ -2660,5 +2698,56 @@ bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool,
return true;
}
//! Parses a global CU mask given as a hexadecimal string and stores it in
//! info_.globalCUMask_.
//!
//! \param cuMaskStr  hexadecimal digits, optionally prefixed with "0x" or "0X"; each set
//!                   bit enables one CU (e.g. "0xf" enables 4 CUs).
//!
//! The string is consumed from its least significant (right-most) end in chunks of up to
//! 8 hex digits, so info_.globalCUMask_[0] holds the lowest 32 bits of the mask. Digits
//! beyond the number of physical CUs (expressed in 4-bit groups) are ignored.
//!
//! info_.globalCUMask_ is left empty, and info_.maxComputeUnits_ untouched, when the string
//! is empty, contains a non-hex character in the digits that are read, enables no CU at all,
//! or enables every CU that can be addressed (the mask would be a no-op). Otherwise
//! info_.maxComputeUnits_ is lowered to the number of enabled CUs (halved in WGP mode).
void Device::getGlobalCUMask(std::string cuMaskStr) {
  // Start from a clean state so that every early return below means "no global mask"
  info_.globalCUMask_ = {};

  if (cuMaskStr.compare(0, 2, "0x") == 0 || cuMaskStr.compare(0, 2, "0X") == 0) {
    cuMaskStr.erase(0, 2);
  }
  if (cuMaskStr.empty()) {
    return;
  }

  // the number of current physical CUs compressed in 4-bits (i.e. in hex digits);
  // maxComputeUnits_ is doubled first when WGP mode is enabled
  // NOTE(review): the division truncates, so when the CU count is not a multiple of 4 the
  // top (count % 4) CUs cannot be selected through the mask - confirm this is intended
  const size_t compPhysicalCUs = static_cast<size_t>(
      (settings().enableWgpMode_ ? info_.maxComputeUnits_ * 2 : info_.maxComputeUnits_) / 4);

  // Only this many hex digits, counted from the end of the string, are meaningful
  const size_t digitsToRead = std::min(cuMaskStr.length(), compPhysicalCUs);

  // Build the mask locally; it is published only once the whole string has been validated
  std::vector<uint32_t> mask;
  // the number of final available compute units after applying the requested CU mask
  uint32_t availCUs = 0;
  // one past the last not-yet-consumed character of cuMaskStr
  size_t end = cuMaskStr.length();

  for (size_t consumed = 0; consumed < digitsToRead; consumed += 8) {
    // 8 digits per 32-bit word, except for the last (most significant) word which gets
    // whatever is left. digitsToRead <= length() guarantees end >= chunkLen.
    const size_t chunkLen = std::min<size_t>(8, digitsToRead - consumed);
    const std::string chunk = cuMaskStr.substr(end - chunkLen, chunkLen);
    end -= chunkLen;

    // std::stoul silently accepts signs, whitespace and trailing garbage; reject those
    if (chunk.find_first_not_of("0123456789abcdefABCDEF") != std::string::npos) {
      return;
    }

    unsigned long ul = 0;
    try {
      ul = std::stoul(chunk, nullptr, 16);
    } catch (const std::logic_error&) {
      // covers both std::invalid_argument and std::out_of_range
      return;
    }

    uint32_t word = static_cast<uint32_t>(ul);
    mask.push_back(word);

    // count number of set bits in this word to find the number of active CUs
    while (word != 0) {
      word &= (word - 1);
      ++availCUs;
    }
  }

  // A mask that enables nothing, or everything, is not applied
  if (availCUs == 0 || availCUs >= compPhysicalCUs * 4) {
    return;
  }

  // update the maxComputeUnits_ based on the requested CU mask
  info_.maxComputeUnits_ = settings().enableWgpMode_ ? availCUs / 2 : availCUs;
  info_.globalCUMask_.swap(mask);
}
} // namespace roc
#endif // WITHOUT_HSA_BACKEND
+2
查看文件
@@ -498,6 +498,8 @@ class Device : public NullDevice {
//! Initialize memory in AMD HMM on the current device or keeps it in the host memory
bool SvmAllocInit(void* memory, size_t size) const;
void getGlobalCUMask(std::string cuMaskStr);
private:
bool SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryAdvice advice,
bool first_alloc = false, bool use_cpu = false) const;
+4 -1
查看文件
@@ -249,7 +249,10 @@ release(bool, HIP_FORCE_QUEUE_PROFILING, false, \
release(uint, PAL_FORCE_ASIC_REVISION, 0, \
"Force a specific asic revision for all devices") \
release(bool, PAL_EMBED_KERNEL_MD, false, \
"Enables writing kernel metadata into command buffers.")
"Enables writing kernel metadata into command buffers.") \
release(cstring, ROC_GLOBAL_CU_MASK, "", \
"Sets a global CU mask (entered as hex value) for all queues," \
"Each active bit represents using one CU (e.g., 0xf enables only 4 CUs)")
namespace amd {