Add HSA_CU_MASK
The new environment variable HSA_CU_MASK lets users
specify a CU mask that is applied to every queue created
on the listed GPUs. hsa_amd_queue_cu_set_mask cannot
enable CUs outside this mask.
A new API, hsa_amd_queue_cu_get_mask, is added to query
a queue's current CU mask.
Change-Id: I846c03a5faaca9b95067c31db84b59cc9fce2f03
[ROCm/ROCR-Runtime commit: 4455250be1]
This commit is contained in:
@@ -151,6 +151,7 @@ set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY LINK_FLAGS ${HSA_SHARED_LINK
|
||||
set ( SRCS core/util/lnx/os_linux.cpp
|
||||
core/util/small_heap.cpp
|
||||
core/util/timer.cpp
|
||||
core/util/flag.cpp
|
||||
core/runtime/amd_blit_kernel.cpp
|
||||
core/runtime/amd_blit_sdma.cpp
|
||||
core/runtime/amd_cpu_agent.cpp
|
||||
|
||||
@@ -926,6 +926,12 @@ hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
|
||||
queue, num_cu_mask_count, cu_mask);
|
||||
}
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
                                               uint32_t* cu_mask) {
  // Forward the call, arguments untouched, to the entry currently installed in
  // the AMD extension table and hand its status straight back to the caller.
  auto* const get_mask_entry = amdExtTable->hsa_amd_queue_cu_get_mask_fn;
  return get_mask_entry(queue, num_cu_mask_count, cu_mask);
}
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API
|
||||
hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
|
||||
|
||||
@@ -185,7 +185,16 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
|
||||
/// @param cu_mask pointer to cu mask
|
||||
///
|
||||
/// @return hsa_status_t
|
||||
hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override;
|
||||
hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override;
|
||||
|
||||
/// @brief Get CU Masking
|
||||
///
|
||||
/// @param num_cu_mask_count size of mask bit array
|
||||
///
|
||||
/// @param cu_mask pointer to cu mask
|
||||
///
|
||||
/// @return hsa_status_t
|
||||
hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override;
|
||||
|
||||
// @brief Submits a block of PM4 and waits until it has been executed.
|
||||
void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override;
|
||||
@@ -277,6 +286,12 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
|
||||
// Exception notification signal
|
||||
Signal* exception_signal_;
|
||||
|
||||
// CU mask lock
|
||||
KernelMutex mask_lock_;
|
||||
|
||||
// Current CU mask
|
||||
std::vector<uint32_t> cu_mask_;
|
||||
|
||||
// Shared event used for queue errors
|
||||
static HsaEvent* queue_event_;
|
||||
|
||||
|
||||
@@ -174,7 +174,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
// id.
|
||||
// @param [in] node_props Node property.
|
||||
// @param [in] xnack_mode XNACK mode of device.
|
||||
GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode);
|
||||
GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index);
|
||||
|
||||
// @brief GPU agent destructor.
|
||||
~GpuAgent();
|
||||
@@ -322,6 +322,10 @@ class GpuAgent : public GpuAgentInt {
|
||||
return memory_max_frequency_;
|
||||
}
|
||||
|
||||
// @brief Order the device is surfaced in hsa_iterate_agents counting only
|
||||
// GPU devices.
|
||||
__forceinline uint32_t enumeration_index() const { return enum_index_; }
|
||||
|
||||
void Trim() override;
|
||||
|
||||
protected:
|
||||
@@ -454,6 +458,9 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @brief The GPU memory maximum frequency in MHz.
|
||||
uint32_t memory_max_frequency_;
|
||||
|
||||
// @brief Enumeration index
|
||||
uint32_t enum_index_;
|
||||
|
||||
// @brief HDP flush registers
|
||||
hsa_amd_hdp_flush_t HDP_flush_ = {nullptr, nullptr};
|
||||
|
||||
|
||||
@@ -144,8 +144,12 @@ class HostQueue : public Queue {
|
||||
std::memory_order_release);
|
||||
}
|
||||
|
||||
hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
|
||||
return HSA_STATUS_ERROR;
|
||||
hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override {
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override {
|
||||
|
||||
@@ -113,6 +113,10 @@ hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
|
||||
uint32_t num_cu_mask_count,
|
||||
const uint32_t* cu_mask);
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
|
||||
uint32_t* cu_mask);
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t
|
||||
hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
|
||||
|
||||
@@ -114,9 +114,12 @@ class QueueWrapper : public Queue {
|
||||
uint64_t AddWriteIndexRelease(uint64_t value) override {
|
||||
return wrapped->AddWriteIndexRelease(value);
|
||||
}
|
||||
hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
|
||||
hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
|
||||
return wrapped->SetCUMasking(num_cu_mask_count, cu_mask);
|
||||
}
|
||||
hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override {
|
||||
return wrapped->GetCUMasking(num_cu_mask_count, cu_mask);
|
||||
}
|
||||
void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override {
|
||||
wrapped->ExecutePM4(cmd_data, cmd_size_b);
|
||||
}
|
||||
|
||||
@@ -295,8 +295,16 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
|
||||
/// @param cu_mask pointer to cu mask
|
||||
///
|
||||
/// @return hsa_status_t
|
||||
virtual hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count,
|
||||
const uint32_t* cu_mask) = 0;
|
||||
virtual hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) = 0;
|
||||
|
||||
/// @brief Get CU Masking
|
||||
///
|
||||
/// @param num_cu_mask_count size of mask bit array
|
||||
///
|
||||
/// @param cu_mask pointer to cu mask
|
||||
///
|
||||
/// @return hsa_status_t
|
||||
virtual hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) = 0;
|
||||
|
||||
// @brief Submits a block of PM4 and waits until it has been executed.
|
||||
virtual void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) = 0;
|
||||
|
||||
@@ -304,6 +304,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
|
||||
exceptionState = ERROR_HANDLER_DONE;
|
||||
}
|
||||
|
||||
// Allocate IB for icache flushes.
|
||||
pm4_ib_buf_ = core::Runtime::runtime_singleton_->system_allocator()(
|
||||
pm4_ib_size_b_, 0x1000, core::MemoryRegion::AllocateExecutable);
|
||||
if (pm4_ib_buf_ == nullptr)
|
||||
@@ -313,6 +314,9 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
|
||||
core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_);
|
||||
});
|
||||
|
||||
// Set initial CU mask
|
||||
SetCUMasking(0, nullptr);
|
||||
|
||||
active_ = true;
|
||||
|
||||
PM4IBGuard.Dismiss();
|
||||
@@ -1027,12 +1031,72 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
|
||||
return false;
|
||||
}
|
||||
|
||||
hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count,
|
||||
const uint32_t* cu_mask) {
|
||||
HSAKMT_STATUS ret = hsaKmtSetQueueCUMask(
|
||||
queue_id_, num_cu_mask_count,
|
||||
reinterpret_cast<HSAuint32*>(const_cast<uint32_t*>(cu_mask)));
|
||||
return (HSAKMT_STATUS_SUCCESS == ret) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
|
||||
/// @brief Set CU masking for this queue.
///
/// Builds the effective mask from the caller's mask, the per-GPU mask returned
/// by flag().cu_mask() and the agent's compute unit count, programs it with
/// hsaKmtSetQueueCUMask and, on success, records it in cu_mask_.
///
/// @param num_cu_mask_count size of the cu_mask bit array in bits. 0 starts
/// from a mask with every bit set and does not read cu_mask.
///
/// @param cu_mask pointer to num_cu_mask_count / 32 dwords of mask bits.
///
/// @return HSA_STATUS_SUCCESS when the mask was applied as requested,
/// HSA_STATUS_CU_MASK_REDUCED when it was applied but requested bits were
/// removed by the global mask, HSA_STATUS_ERROR when there is no mask to apply
/// or hsaKmtSetQueueCUMask fails.
hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) {
  // Zero-initialized so the sizes derived below are never computed from an
  // indeterminate value should the query not write its result.
  uint32_t cu_count = 0;
  agent_->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  size_t mask_dwords = (cu_count + 31) / 32;
  // Mask to trim the last uint32_t in cu_mask to the physical CU count.
  // Zero when cu_count is a multiple of 32, i.e. no trimming is required.
  const uint32_t tail_mask = (1u << (cu_count % 32)) - 1;

  // Bind by reference: cu_mask() returns a const reference, copying the vector
  // on every call is unnecessary.
  const auto& global_mask =
      core::Runtime::runtime_singleton_->flag().cu_mask(agent_->enumeration_index());
  std::vector<uint32_t> mask;

  bool clipped = false;

  // num_cu_mask_count = 0 resets the CU mask.
  if (num_cu_mask_count == 0) {
    mask.assign(mask_dwords, ~uint32_t(0));
  } else {
    mask.assign(cu_mask, cu_mask + num_cu_mask_count / 32);
  }

  // Apply global mask to user mask
  if (!global_mask.empty()) {
    // Limit mask processing to smallest needed dword range
    const size_t limit = Min(global_mask.size(), mask.size(), mask_dwords);

    // Any bit requested in a dword that is about to be dropped counts as clipped.
    for (size_t i = limit; i < mask.size(); i++) {
      if (mask[i] != 0) {
        clipped = true;
        break;
      }
    }

    mask.resize(limit, 0);
    for (size_t i = 0; i < limit; i++) {
      clipped |= ((mask[i] & (~global_mask[i])) != 0);
      mask[i] &= global_mask[i];
    }
  } else {
    // Limit to physical CU range only
    mask.resize(Min(mask.size(), mask_dwords), 0);
  }

  // Clip last dword to physical CU limit if necessary
  if ((mask.size() == mask_dwords) && (tail_mask != 0)) mask[mask_dwords - 1] &= tail_mask;

  // Nothing to program. Bail out instead of forming a pointer to the first
  // element of an empty vector.
  if (mask.empty()) return HSA_STATUS_ERROR;

  // Apply mask and update current cu masking tracking.
  ScopedAcquire<KernelMutex> lock(&mask_lock_);
  HSAKMT_STATUS ret =
      hsaKmtSetQueueCUMask(queue_id_, mask.size() * 32, reinterpret_cast<HSAuint32*>(mask.data()));
  if (ret != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR;
  cu_mask_ = std::move(mask);
  return clipped ? (hsa_status_t)HSA_STATUS_CU_MASK_REDUCED : HSA_STATUS_SUCCESS;
}
|
||||
|
||||
/// @brief Copy the queue's recorded CU mask into the caller's buffer.
///
/// @param num_cu_mask_count size of the cu_mask bit array in bits; only whole
/// dwords (num_cu_mask_count / 32) are written.
///
/// @param cu_mask destination buffer. Dwords beyond the recorded mask are
/// zero filled.
///
/// @return HSA_STATUS_SUCCESS
hsa_status_t AqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) {
  ScopedAcquire<KernelMutex> lock(&mask_lock_);
  assert(!cu_mask_.empty() && "No current cu_mask!");

  const size_t stored_dwords = cu_mask_.size();
  const size_t requested_dwords = num_cu_mask_count / 32;
  const size_t copied_dwords =
      (requested_dwords > stored_dwords) ? stored_dwords : requested_dwords;

  // Recorded mask first, then clear whatever the caller asked for beyond it.
  memcpy(cu_mask, &cu_mask_[0], sizeof(uint32_t) * copied_dwords);
  if (requested_dwords > copied_dwords) {
    memset(&cu_mask[copied_dwords], 0, sizeof(uint32_t) * (requested_dwords - copied_dwords));
  }
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) {
|
||||
|
||||
@@ -77,7 +77,8 @@ extern HsaApiTable hsa_internal_api_table_;
|
||||
} // namespace core
|
||||
|
||||
namespace AMD {
|
||||
GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode)
|
||||
GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode,
|
||||
uint32_t index)
|
||||
: GpuAgentInt(node),
|
||||
properties_(node_props),
|
||||
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
|
||||
@@ -89,6 +90,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
|
||||
doorbell_queue_map_(NULL),
|
||||
memory_bus_width_(0),
|
||||
memory_max_frequency_(0),
|
||||
enum_index_(index),
|
||||
ape1_base_(0),
|
||||
ape1_size_(0),
|
||||
scratch_cache_(
|
||||
|
||||
@@ -121,7 +121,8 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac
|
||||
return nullptr;
|
||||
}
|
||||
try {
|
||||
gpu = new GpuAgent(node_id, node_prop, xnack_mode);
|
||||
gpu = new GpuAgent(node_id, node_prop, xnack_mode,
|
||||
core::Runtime::runtime_singleton_->gpu_agents().size());
|
||||
|
||||
const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion().version;
|
||||
|
||||
@@ -146,7 +147,8 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac
|
||||
if (gpu->isa()->GetProcessorName() == "gfx908") {
|
||||
node_prop.Capability.ui32.SRAM_EDCSupport = 1;
|
||||
delete gpu;
|
||||
gpu = new GpuAgent(node_id, node_prop, xnack_mode);
|
||||
gpu = new GpuAgent(node_id, node_prop, xnack_mode,
|
||||
core::Runtime::runtime_singleton_->gpu_agents().size());
|
||||
}
|
||||
}
|
||||
} catch (const hsa_exception& e) {
|
||||
|
||||
@@ -2762,6 +2762,12 @@ hsa_status_t hsa_status_string(
|
||||
*status_string =
|
||||
"HSA_STATUS_ERROR_MEMORY_FAULT: Agent attempted to access an inaccessible address.";
|
||||
break;
|
||||
case HSA_STATUS_CU_MASK_REDUCED:
|
||||
*status_string =
|
||||
"HSA_STATUS_CU_MASK_REDUCED: The CU mask was successfully set but the mask attempted to "
|
||||
"enable a CU which was disabled for the process. CUs disabled for the process remain "
|
||||
"disabled.";
|
||||
break;
|
||||
default:
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
@@ -359,6 +359,7 @@ void HsaApiTable::UpdateAmdExts() {
|
||||
amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function;
|
||||
amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any;
|
||||
amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask;
|
||||
amd_ext_api.hsa_amd_queue_cu_get_mask_fn = AMD::hsa_amd_queue_cu_get_mask;
|
||||
amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info;
|
||||
amd_ext_api.hsa_amd_agent_iterate_memory_pools_fn = AMD::hsa_amd_agent_iterate_memory_pools;
|
||||
amd_ext_api.hsa_amd_memory_pool_allocate_fn = AMD::hsa_amd_memory_pool_allocate;
|
||||
|
||||
@@ -535,19 +535,34 @@ hsa_status_t hsa_amd_async_function(void (*callback)(void* arg), void* arg) {
|
||||
CATCH;
|
||||
}
|
||||
|
||||
hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
|
||||
uint32_t num_cu_mask_count,
|
||||
const uint32_t* cu_mask) {
|
||||
// Validates the arguments and applies cu_mask to the queue through
// core::Queue::SetCUMasking. num_cu_mask_count is a bit count and must be a
// non-zero multiple of 32.
hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
                                       const uint32_t* cu_mask) {
  TRY;
  IS_OPEN();
  IS_BAD_PTR(cu_mask);

  core::Queue* target_queue = core::Queue::Convert(queue);
  IS_VALID(target_queue);

  const bool whole_dwords = (num_cu_mask_count % 32) == 0;
  if ((num_cu_mask_count == 0) || !whole_dwords) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  return target_queue->SetCUMasking(num_cu_mask_count, cu_mask);
  CATCH;
}
|
||||
|
||||
// Validates the arguments and fills cu_mask from the queue through
// core::Queue::GetCUMasking. num_cu_mask_count is a bit count and must be a
// non-zero multiple of 32.
hsa_status_t hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
                                       uint32_t* cu_mask) {
  TRY;
  IS_OPEN();
  IS_BAD_PTR(cu_mask);

  core::Queue* target_queue = core::Queue::Convert(queue);
  IS_VALID(target_queue);

  const bool whole_dwords = (num_cu_mask_count % 32) == 0;
  if ((num_cu_mask_count == 0) || !whole_dwords) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  return target_queue->GetCUMasking(num_cu_mask_count, cu_mask);
  CATCH;
}
|
||||
|
||||
hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
|
||||
hsa_agent_t* agents, int num_agent,
|
||||
void** agent_ptr) {
|
||||
|
||||
@@ -0,0 +1,204 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2021-2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "core/util/flag.h"
|
||||
#include "core/util/utils.h"
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <locale>
|
||||
|
||||
namespace rocr {
|
||||
|
||||
// split at separators
|
||||
// Split str at each occurrence of sep and return the pieces in order.
//
// The input is consumed while splitting: on return str holds only the final
// piece, or is empty when the text ended with a separator. Empty pieces between
// adjacent separators are kept, but a trailing separator does not produce a
// final empty piece.
static std::vector<std::string> split(std::string& str, char sep) {
  std::vector<std::string> ret;
  while (!str.empty()) {
    size_t pos = str.find(sep);
    if (pos == std::string::npos) {
      ret.push_back(str);
      return ret;
    }
    ret.push_back(str.substr(0, pos));
    str.erase(0, pos + 1);
  }
  return ret;
}

// Convert a non-empty run of base 10 digits into id.
// Returns false, leaving id untouched, for empty text, for any character that
// is not a decimal digit (including signs) and for values that do not fit in
// 32 bits.
static bool parse_id(const std::string& text, uint32_t& id) {
  if (text.empty()) return false;
  uint64_t value = 0;
  for (char c : text) {
    if (c < '0' || c > '9') return false;
    value = value * 10 + static_cast<uint64_t>(c - '0');
    // Checked every digit so the 64 bit accumulator itself can never wrap.
    if (value > 0xFFFFFFFFull) return false;
  }
  id = static_cast<uint32_t>(value);
  return true;
}

// Parse id,id-id,... strings into id lists
//
// Returns the ids in ascending order. Any syntax error (empty item, missing
// range bound, more than one '-', non digit characters, id too large for 32
// bits, inverted range, repeated id) yields an empty list. str is consumed by
// split().
static std::vector<uint32_t> get_elements(std::string& str) {
  std::vector<uint32_t> ret;

  std::vector<std::string> ranges = split(str, ',');
  for (auto& item : ranges) {
    // An empty item ("1,,2") would make split() return no pieces at all, and
    // split() silently drops a dangling '-' ("5-"); both are malformed.
    if (item.empty() || item.back() == '-') return {};

    auto range = split(item, '-');
    // failure, too many -'s.
    if (range.size() > 2) return {};

    // Invalid syntax - id's must be base 10 digits only.
    uint32_t first = 0;
    if (!parse_id(range[0], first)) return {};
    ret.push_back(first);

    if (range.size() == 2) {
      uint32_t last = 0;
      if (!parse_id(range[1], last)) return {};  // bad syntax
      if (last < first) return {};               // inverted range
      // Pre-increment form so that last == UINT32_MAX cannot wrap the bound.
      for (uint32_t id = first; id < last;) ret.push_back(++id);
    }
  }

  // Confirm no duplicate ids.
  std::sort(ret.begin(), ret.end());
  if (std::adjacent_find(ret.begin(), ret.end()) != ret.end()) return {};

  // Good parse, keep result.
  return ret;
}
|
||||
|
||||
/*
|
||||
Parse env var per the following syntax, all whitespace is ignored:
|
||||
|
||||
ID = [0-9][0-9]* ex. base 10 numbers
|
||||
ID_list = (ID | ID-ID)[, (ID | ID-ID)]* ex. 0,2-4,7
|
||||
GPU_list = ID_list ex. 0,2-4,7
|
||||
CU_list = 0x[0-9A-F]+ | ID_list ex. 0x337F OR 0,2-4,7
|
||||
CU_Set = GPU_list : CU_list ex. 0,2-4,7:0-15,32-47 OR 0,2-4,7:0x337F
|
||||
HSA_CU_MASK = CU_Set [; CU_Set]* ex. 0,2-4,7:0-15,32-47; 3-9:0x337F
|
||||
|
||||
GPU indexes are taken post ROCM_VISIBLE_DEVICES reordering.
|
||||
Listed or bit set CUs will be enabled at queue creation on the associated GPU.
|
||||
All other CUs on the associated GPUs will be disabled.
|
||||
CU masks of unlisted GPUs are not restricted.
|
||||
|
||||
Repeating a GPU or CU ID is a syntax error.
|
||||
Parsing stops at the first CU_Set that has a syntax error, that set and all
|
||||
following sets are ignored.
|
||||
Specifying a mask with no usable CUs (CU_list is 0x0) is a syntax error.
|
||||
Users should use ROCM_VISIBLE_DEVICES if they want to exclude use of a
|
||||
particular GPU.
|
||||
*/
|
||||
void Flag::parse_masks(std::string& var) {
|
||||
if (var.empty()) return;
|
||||
|
||||
// Remove whitespace
|
||||
auto end = std::remove_if(var.begin(), var.end(),
|
||||
[](char c) { return std::isspace<char>(c, std::locale::classic()); });
|
||||
var.erase(end, var.end());
|
||||
|
||||
// Switch to uppercase
|
||||
for (auto& c : var) c = toupper(c);
|
||||
|
||||
// Iterate over cu sets
|
||||
auto sets = split(var, ';');
|
||||
for (auto& set : sets) {
|
||||
auto parts = split(set, ':');
|
||||
if (parts.size() != 2) return;
|
||||
|
||||
// temp storage for cu_set parsing.
|
||||
std::vector<uint32_t> gpu_index;
|
||||
std::vector<uint32_t> mask;
|
||||
|
||||
// parse cu list first, check for bitmask format
|
||||
if (parts[1][1] == 'X') {
|
||||
// Confirm hex format and strip prefix
|
||||
auto& cu = parts[1];
|
||||
if (cu[0] != '0') return;
|
||||
cu.erase(0, 2);
|
||||
|
||||
// Ensure all valid hex characters
|
||||
for (auto& c : cu) {
|
||||
if (!isxdigit(c)) return;
|
||||
}
|
||||
|
||||
// Convert to uint32_t, lsb first.
|
||||
size_t len = cu.length();
|
||||
while (len != 0) {
|
||||
size_t trim = Min(len, size_t(8));
|
||||
len -= trim;
|
||||
auto tmp = cu.substr(len, trim);
|
||||
auto chunk = stoul(tmp, nullptr, 16);
|
||||
mask.push_back(chunk);
|
||||
}
|
||||
|
||||
// Trim leading zeros
|
||||
while (!mask.empty() && mask.back() == 0) mask.pop_back();
|
||||
|
||||
// Mask 0x0 is an error.
|
||||
if (mask.empty()) return;
|
||||
|
||||
} else {
|
||||
// parse cu lists
|
||||
auto cu_indices = get_elements(parts[1]);
|
||||
if (cu_indices.empty()) return;
|
||||
uint32_t maxdword = cu_indices.back() / 32 + 1;
|
||||
mask.resize(maxdword, 0);
|
||||
for (auto id : cu_indices) {
|
||||
uint32_t index, offset;
|
||||
index = id / 32;
|
||||
offset = id % 32;
|
||||
mask[index] |= 1ul << offset;
|
||||
}
|
||||
}
|
||||
|
||||
// parse device list
|
||||
gpu_index = get_elements(parts[0]);
|
||||
if (gpu_index.empty()) return;
|
||||
|
||||
// Ensure that no GPU was repeated across cu_sets
|
||||
for (auto id : gpu_index) {
|
||||
if (cu_mask_.find(id) != cu_mask_.end()) return;
|
||||
}
|
||||
|
||||
// Insert into map
|
||||
for (auto id : gpu_index) {
|
||||
cu_mask_[id] = mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocr
|
||||
@@ -3,7 +3,7 @@
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2014-2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
@@ -45,6 +45,8 @@
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "core/util/os.h"
|
||||
@@ -149,6 +151,9 @@ class Flag {
|
||||
|
||||
var = os::GetEnvVar("HSA_ENABLE_DEBUG");
|
||||
debug_ = (var == "1") ? true : false;
|
||||
|
||||
var = os::GetEnvVar("HSA_CU_MASK");
|
||||
parse_masks(var);
|
||||
}
|
||||
|
||||
bool check_flat_scratch() const { return check_flat_scratch_; }
|
||||
@@ -206,6 +211,13 @@ class Flag {
|
||||
|
||||
bool debug() const { return debug_; }
|
||||
|
||||
const std::vector<uint32_t>& cu_mask(uint32_t gpu_index) const {
|
||||
static const std::vector<uint32_t> empty;
|
||||
auto it = cu_mask_.find(gpu_index);
|
||||
if (it == cu_mask_.end()) return empty;
|
||||
return it->second;
|
||||
}
|
||||
|
||||
private:
|
||||
bool check_flat_scratch_;
|
||||
bool enable_vm_fault_message_;
|
||||
@@ -243,6 +255,11 @@ class Flag {
|
||||
// Indicates user preference for Xnack state.
|
||||
XNACK_REQUEST xnack_;
|
||||
|
||||
// Map GPU index post RVD to its default cu mask.
|
||||
std::map<uint32_t, std::vector<uint32_t>> cu_mask_;
|
||||
|
||||
void parse_masks(std::string& args);
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(Flag);
|
||||
};
|
||||
|
||||
|
||||
@@ -179,6 +179,7 @@ global:
|
||||
hsa_amd_async_function;
|
||||
hsa_amd_image_get_info_max_dim;
|
||||
hsa_amd_queue_cu_set_mask;
|
||||
hsa_amd_queue_cu_get_mask;
|
||||
hsa_amd_memory_fill;
|
||||
hsa_amd_memory_async_copy;
|
||||
hsa_amd_memory_async_copy_rect;
|
||||
|
||||
@@ -186,6 +186,7 @@ struct AmdExtTable {
|
||||
decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn;
|
||||
decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn;
|
||||
decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn;
|
||||
decltype(hsa_amd_queue_cu_get_mask)* hsa_amd_queue_cu_get_mask_fn;
|
||||
};
|
||||
|
||||
// Table to export HSA Core Runtime Apis
|
||||
|
||||
@@ -191,6 +191,13 @@ enum {
|
||||
* HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses.
|
||||
*/
|
||||
HSA_STATUS_ERROR_MEMORY_FAULT = 43,
|
||||
|
||||
/**
|
||||
* The CU mask was successfully set but the mask attempted to enable a CU
|
||||
* which was disabled for the process. CUs disabled for the process remain
|
||||
* disabled.
|
||||
*/
|
||||
HSA_STATUS_CU_MASK_REDUCED = 44,
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -780,31 +787,63 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
|
||||
void* value);
|
||||
|
||||
/**
|
||||
* @brief Set a CU affinity to specific queues within the process, this function
|
||||
* call is "atomic".
|
||||
* @brief Set a queue's CU affinity mask.
|
||||
*
|
||||
* @details Enables the queue to run on only selected CUs. The given mask is
|
||||
* combined by bitwise AND with any device wide mask in HSA_CU_MASK before
|
||||
* being applied.
|
||||
*
|
||||
* @param[in] queue A pointer to HSA queue.
|
||||
*
|
||||
* @param[in] num_cu_mask_count Size of CUMask bit array passed in.
|
||||
* @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
|
||||
*
|
||||
* @param[in] cu_mask Bit-vector representing the CU mask.
|
||||
*
|
||||
* @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
|
||||
*
|
||||
* @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed
|
||||
* but the given mask attempted to enable a CU which was disabled by
|
||||
* HSA_CU_MASK. CUs disabled by HSA_CU_MASK remain disabled.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
|
||||
* initialized.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
|
||||
* a multiple of 32 or @p cu_mask is NULL.
|
||||
*
|
||||
*/
|
||||
hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
|
||||
uint32_t num_cu_mask_count,
|
||||
const uint32_t* cu_mask);
|
||||
|
||||
/**
|
||||
* @brief Retrieve a queue's CU affinity mask.
|
||||
*
|
||||
* @details Returns the first num_cu_mask_count bits of a queue's CU mask.
|
||||
* Ensure that num_cu_mask_count is at least as large as
|
||||
* HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask.
|
||||
*
|
||||
* @param[in] queue A pointer to HSA queue.
|
||||
*
|
||||
* @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
|
||||
*
|
||||
* @param[out] cu_mask Bit-vector representing the CU mask.
|
||||
*
|
||||
* @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
|
||||
* initialized.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not
|
||||
* multiple of 32 or @p cu_mask is NULL.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR failed to call thunk api
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
|
||||
* a multiple of 32 or @p cu_mask is NULL.
|
||||
*
|
||||
*/
|
||||
hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
|
||||
uint32_t num_cu_mask_count,
|
||||
const uint32_t* cu_mask);
|
||||
hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
|
||||
uint32_t* cu_mask);
|
||||
|
||||
/**
|
||||
* @brief Memory segments associated with a memory pool.
|
||||
|
||||
Reference in New Issue
Block a user