From 4c6ea88cf5ba489e12df0ddf5d2ca7db4fd6bbc8 Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Tue, 29 Jun 2021 18:03:05 -0500 Subject: [PATCH] Add HSA_CU_MASK New environment variable HSA_CU_MASK allows users to specify a cu mask to every queue allocated from any GPU. hsa_amd_queue_cu_set_mask is restricted from escaping this mask. A new API hsa_amd_queue_cu_get_mask is added to query the current cu mask. Change-Id: I846c03a5faaca9b95067c31db84b59cc9fce2f03 [ROCm/ROCR-Runtime commit: 4455250be1c7a4f5d8c87127b92375a4253e1738] --- .../runtime/hsa-runtime/CMakeLists.txt | 1 + .../core/common/hsa_table_interface.cpp | 6 + .../hsa-runtime/core/inc/amd_aql_queue.h | 17 +- .../hsa-runtime/core/inc/amd_gpu_agent.h | 9 +- .../runtime/hsa-runtime/core/inc/host_queue.h | 8 +- .../hsa-runtime/core/inc/hsa_ext_amd_impl.h | 4 + .../hsa-runtime/core/inc/intercept_queue.h | 5 +- .../runtime/hsa-runtime/core/inc/queue.h | 12 +- .../core/runtime/amd_aql_queue.cpp | 76 ++++++- .../core/runtime/amd_gpu_agent.cpp | 4 +- .../hsa-runtime/core/runtime/amd_topology.cpp | 6 +- .../runtime/hsa-runtime/core/runtime/hsa.cpp | 6 + .../core/runtime/hsa_api_trace.cpp | 1 + .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 21 +- .../runtime/hsa-runtime/core/util/flag.cpp | 204 ++++++++++++++++++ .../runtime/hsa-runtime/core/util/flag.h | 19 +- .../runtime/hsa-runtime/hsacore.so.def | 1 + .../runtime/hsa-runtime/inc/hsa_api_trace.h | 1 + .../runtime/hsa-runtime/inc/hsa_ext_amd.h | 59 ++++- 19 files changed, 430 insertions(+), 30 deletions(-) create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.cpp diff --git a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt index ec0ccdf748..2a8937ce58 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt +++ b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt @@ -151,6 +151,7 @@ set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY LINK_FLAGS 
${HSA_SHARED_LINK set ( SRCS core/util/lnx/os_linux.cpp core/util/small_heap.cpp core/util/timer.cpp + core/util/flag.cpp core/runtime/amd_blit_kernel.cpp core/runtime/amd_blit_sdma.cpp core/runtime/amd_cpu_agent.cpp diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp index 0ff7b5f8b9..3159af3b4f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp @@ -926,6 +926,12 @@ hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, queue, num_cu_mask_count, cu_mask); } +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + uint32_t* cu_mask) { + return amdExtTable->hsa_amd_queue_cu_get_mask_fn(queue, num_cu_mask_count, cu_mask); +} + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h index 2ac6b3503d..0567731225 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h @@ -185,7 +185,16 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo /// @param cu_mask pointer to cu mask /// /// @return hsa_status_t - hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override; + hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override; + + /// @brief Get CU Masking + /// + /// @param num_cu_mask_count size of mask bit array + /// + /// @param cu_mask pointer to cu mask + /// + /// @return hsa_status_t + hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, 
uint32_t* cu_mask) override; // @brief Submits a block of PM4 and waits until it has been executed. void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override; @@ -277,6 +286,12 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo // Exception notification signal Signal* exception_signal_; + // CU mask lock + KernelMutex mask_lock_; + + // Current CU mask + std::vector cu_mask_; + // Shared event used for queue errors static HsaEvent* queue_event_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 152a0e8df6..71b1d3fa44 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -174,7 +174,7 @@ class GpuAgent : public GpuAgentInt { // id. // @param [in] node_props Node property. // @param [in] xnack_mode XNACK mode of device. - GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode); + GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index); // @brief GPU agent destructor. ~GpuAgent(); @@ -322,6 +322,10 @@ class GpuAgent : public GpuAgentInt { return memory_max_frequency_; } + // @brief Order the device is surfaced in hsa_iterate_agents counting only + // GPU devices. + __forceinline uint32_t enumeration_index() const { return enum_index_; } + void Trim() override; protected: @@ -454,6 +458,9 @@ class GpuAgent : public GpuAgentInt { // @brief The GPU memory maximum frequency in MHz. 
uint32_t memory_max_frequency_; + // @brief Enumeration index + uint32_t enum_index_; + // @brief HDP flush registers hsa_amd_hdp_flush_t HDP_flush_ = {nullptr, nullptr}; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h index 3393ede73a..8521aed7b7 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h @@ -144,8 +144,12 @@ class HostQueue : public Queue { std::memory_order_release); } - hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override { - return HSA_STATUS_ERROR; + hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override { + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + + hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override { + return HSA_STATUS_ERROR_INVALID_QUEUE; } void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h index 9954b8fc7a..9ea1b57e55 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h @@ -113,6 +113,10 @@ hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, const uint32_t* cu_mask); +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + uint32_t* cu_mask); + // Mirrors Amd Extension Apis hsa_status_t hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h index 4ab897fae9..209ee95405 100644 --- 
a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h @@ -114,9 +114,12 @@ class QueueWrapper : public Queue { uint64_t AddWriteIndexRelease(uint64_t value) override { return wrapped->AddWriteIndexRelease(value); } - hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override { + hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override { return wrapped->SetCUMasking(num_cu_mask_count, cu_mask); } + hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override { + return wrapped->GetCUMasking(num_cu_mask_count, cu_mask); + } void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override { wrapped->ExecutePM4(cmd_data, cmd_size_b); } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h index 9553a170c9..71866ff8f0 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h @@ -295,8 +295,16 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue { /// @param cu_mask pointer to cu mask /// /// @return hsa_status_t - virtual hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, - const uint32_t* cu_mask) = 0; + virtual hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) = 0; + + /// @brief Get CU Masking + /// + /// @param num_cu_mask_count size of mask bit array + /// + /// @param cu_mask pointer to cu mask + /// + /// @return hsa_status_t + virtual hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) = 0; // @brief Submits a block of PM4 and waits until it has been executed. 
virtual void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) = 0; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 19d5cc622d..8e0e602e13 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -304,6 +304,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr exceptionState = ERROR_HANDLER_DONE; } + // Allocate IB for icache flushes. pm4_ib_buf_ = core::Runtime::runtime_singleton_->system_allocator()( pm4_ib_size_b_, 0x1000, core::MemoryRegion::AllocateExecutable); if (pm4_ib_buf_ == nullptr) @@ -313,6 +314,9 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_); }); + // Set initial CU mask + SetCUMasking(0, nullptr); + active_ = true; PM4IBGuard.Dismiss(); @@ -1027,12 +1031,72 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) { return false; } -hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count, - const uint32_t* cu_mask) { - HSAKMT_STATUS ret = hsaKmtSetQueueCUMask( - queue_id_, num_cu_mask_count, - reinterpret_cast(const_cast(cu_mask))); - return (HSAKMT_STATUS_SUCCESS == ret) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; +hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) { + uint32_t cu_count; + agent_->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count); + size_t mask_dwords = (cu_count + 31) / 32; + // Mask to trim the last uint32_t in cu_mask to the physical CU count + uint32_t tail_mask = (1 << (cu_count % 32)) - 1; + + auto global_mask = core::Runtime::runtime_singleton_->flag().cu_mask(agent_->enumeration_index()); + std::vector mask; + + bool clipped = false; + + // num_cu_mask_count = 0 resets the CU mask. + if (num_cu_mask_count == 0) { + for (int i = 0; i < mask_dwords; i++) mask.push_back(-1); + } else { + for (int i = 0; i < num_cu_mask_count / 32; i++) mask.push_back(cu_mask[i]); + } + + // Apply global mask to user mask + if (!global_mask.empty()) { + // Limit mask processing to smallest needed dword range + size_t limit = Min(global_mask.size(), mask.size(), mask_dwords); + + // Check for disabling requested cus. + for (int i = limit; i < mask.size(); i++) { + if (mask[i] != 0) { + clipped = true; + break; + } + } + + mask.resize(limit, 0); + for (size_t i = 0; i < limit; i++) { + clipped |= ((mask[i] & (~global_mask[i])) != 0); + mask[i] &= global_mask[i]; + } + } else { + // Limit to physical CU range only + size_t limit = Min(mask.size(), mask_dwords); + mask.resize(limit, 0); + } + + // Clip last dword to physical CU limit if necessary + if ((mask.size() == mask_dwords) && (tail_mask != 0)) mask[mask_dwords - 1] &= tail_mask; + + // Apply mask and update current cu masking tracking. + ScopedAcquire lock(&mask_lock_); + HSAKMT_STATUS ret = + hsaKmtSetQueueCUMask(queue_id_, mask.size() * 32, reinterpret_cast(&mask[0])); + if (ret != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR; + cu_mask_ = std::move(mask); + return clipped ? 
(hsa_status_t)HSA_STATUS_CU_MASK_REDUCED : HSA_STATUS_SUCCESS; +} + +hsa_status_t AqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) { + ScopedAcquire lock(&mask_lock_); + assert(!cu_mask_.empty() && "No current cu_mask!"); + + uint32_t user_dword_count = num_cu_mask_count / 32; + if (user_dword_count > cu_mask_.size()) { + memset(&cu_mask[cu_mask_.size()], 0, sizeof(uint32_t) * (user_dword_count - cu_mask_.size())); + user_dword_count = cu_mask_.size(); + } + memcpy(cu_mask, &cu_mask_[0], sizeof(uint32_t) * user_dword_count); + return HSA_STATUS_SUCCESS; } void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 84cf857e61..da7932d951 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -77,7 +77,8 @@ extern HsaApiTable hsa_internal_api_table_; } // namespace core namespace AMD { -GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode) +GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, + uint32_t index) : GpuAgentInt(node), properties_(node_props), current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT), @@ -89,6 +90,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna doorbell_queue_map_(NULL), memory_bus_width_(0), memory_max_frequency_(0), + enum_index_(index), ape1_base_(0), ape1_size_(0), scratch_cache_( diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp index bd8e9362bd..001b58feb0 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp 
@@ -121,7 +121,8 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac return nullptr; } try { - gpu = new GpuAgent(node_id, node_prop, xnack_mode); + gpu = new GpuAgent(node_id, node_prop, xnack_mode, + core::Runtime::runtime_singleton_->gpu_agents().size()); const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion().version; @@ -146,7 +147,8 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac if (gpu->isa()->GetProcessorName() == "gfx908") { node_prop.Capability.ui32.SRAM_EDCSupport = 1; delete gpu; - gpu = new GpuAgent(node_id, node_prop, xnack_mode); + gpu = new GpuAgent(node_id, node_prop, xnack_mode, + core::Runtime::runtime_singleton_->gpu_agents().size()); } } } catch (const hsa_exception& e) { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp index b2d1f6537c..cc2c8f7841 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp @@ -2762,6 +2762,12 @@ hsa_status_t hsa_status_string( *status_string = "HSA_STATUS_ERROR_MEMORY_FAULT: Agent attempted to access an inaccessible address."; break; + case HSA_STATUS_CU_MASK_REDUCED: + *status_string = + "HSA_STATUS_CU_MASK_REDUCED: The CU mask was successfully set but the mask attempted to " + "enable a CU which was disabled for the process. 
CUs disabled for the process remain " + "disabled."; + break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index 605ec15aec..ab6a1f31b9 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -359,6 +359,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function; amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any; amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask; + amd_ext_api.hsa_amd_queue_cu_get_mask_fn = AMD::hsa_amd_queue_cu_get_mask; amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info; amd_ext_api.hsa_amd_agent_iterate_memory_pools_fn = AMD::hsa_amd_agent_iterate_memory_pools; amd_ext_api.hsa_amd_memory_pool_allocate_fn = AMD::hsa_amd_memory_pool_allocate; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index 66d7ff7278..843a588ff0 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -535,19 +535,34 @@ hsa_status_t hsa_amd_async_function(void (*callback)(void* arg), void* arg) { CATCH; } -hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, - uint32_t num_cu_mask_count, - const uint32_t* cu_mask) { +hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + const uint32_t* cu_mask) { TRY; IS_OPEN(); IS_BAD_PTR(cu_mask); core::Queue* cmd_queue = core::Queue::Convert(queue); IS_VALID(cmd_queue); + if ((num_cu_mask_count == 0) || (num_cu_mask_count % 32 != 0)) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; 
return cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask); CATCH; } +hsa_status_t hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + uint32_t* cu_mask) { + TRY; + IS_OPEN(); + IS_BAD_PTR(cu_mask); + + core::Queue* cmd_queue = core::Queue::Convert(queue); + IS_VALID(cmd_queue); + if ((num_cu_mask_count == 0) || (num_cu_mask_count % 32 != 0)) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + return cmd_queue->GetCUMasking(num_cu_mask_count, cu_mask); + CATCH; +} + hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size, hsa_agent_t* agents, int num_agent, void** agent_ptr) { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.cpp new file mode 100644 index 0000000000..8a8c6e6e00 --- /dev/null +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.cpp @@ -0,0 +1,204 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2021-2021, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/util/flag.h" +#include "core/util/utils.h" + +#include +#include +#include +#include +#include + +namespace rocr { + +// split at separators +static std::vector split(std::string& str, char sep) { + std::vector ret; + while (!str.empty()) { + size_t pos = str.find(sep); + if (pos == std::string::npos) { + ret.push_back(str); + return ret; + } + ret.push_back(str.substr(0, pos)); + str.erase(0, pos + 1); + } + return ret; +}; + +// Parse id,id-id,... strings into id lists +static std::vector get_elements(std::string& str) { + std::vector ret; + MAKE_NAMED_SCOPE_GUARD(error, [&]() { ret.clear(); }); + + std::vector ranges = split(str, ','); + for (auto& str : ranges) { + auto range = split(str, '-'); + // failure, too many -'s. + if (range.size() > 2) return ret; + + char* end; + uint32_t index = strtoul(range[0].c_str(), &end, 10); + // Invalid syntax - id's must be base 10 digits only.
+ if (*end != '\0') return ret; + ret.push_back(index); + + if (range.size() == 2) { + uint32_t secondindex = strtoul(range[1].c_str(), &end, 10); + if (*end != '\0') return ret; // bad syntax + if (secondindex < index) return ret; // inverted range + for (uint32_t i = index + 1; i < secondindex + 1; i++) ret.push_back(i); + } + } + + // Confirm no duplicate ids. + std::sort(ret.begin(), ret.end()); + if (std::adjacent_find(ret.begin(), ret.end()) != ret.end()) return ret; + + // Good parse, keep result. + error.Dismiss(); + return ret; +}; + +/* +Parse env var per the following syntax, all whitespace is ignored: + +ID = [0-9][0-9]* ex. base 10 numbers +ID_list = (ID | ID-ID)[, (ID | ID-ID)]* ex. 0,2-4,7 +GPU_list = ID_list ex. 0,2-4,7 +CU_list = 0x[0-F]* | ID_list ex. 0x337F OR 0,2-4,7 +CU_Set = GPU_list : CU_list ex. 0,2-4,7:0-15,32-47 OR 0,2-4,7:0x337F +HSA_CU_MASK = CU_Set [; CU_Set]* ex. 0,2-4,7:0-15,32-47; 3-9:0x337F + +GPU indexes are taken post ROCM_VISIBLE_DEVICES reordering. +Listed or bit set CUs will be enabled at queue creation on the associated GPU. +All other CUs on the associated GPUs will be disabled. +CU masks of unlisted GPUs are not restricted. + +Repeating a GPU or CU ID is a syntax error. +Parsing stops at the first CU_Set that has a syntax error, that set and all +following sets are ignored. +Specifying a mask with no usable CUs (CU_list is 0x0) is a syntax error. +Users should use ROCM_VISIBLE_DEVICES if they want to exclude use of a +particular GPU. 
+*/ +void Flag::parse_masks(std::string& var) { + if (var.empty()) return; + + // Remove whitespace + auto end = std::remove_if(var.begin(), var.end(), + [](char c) { return std::isspace(c, std::locale::classic()); }); + var.erase(end, var.end()); + + // Switch to uppercase + for (auto& c : var) c = toupper(c); + + // Iterate over cu sets + auto sets = split(var, ';'); + for (auto& set : sets) { + auto parts = split(set, ':'); + if (parts.size() != 2) return; + + // temp storage for cu_set parsing. + std::vector gpu_index; + std::vector mask; + + // parse cu list first, check for bitmask format + if (parts[1][1] == 'X') { + // Confirm hex format and strip prefix + auto& cu = parts[1]; + if (cu[0] != '0') return; + cu.erase(0, 2); + + // Ensure all valid hex characters + for (auto& c : cu) { + if (!isxdigit(c)) return; + } + + // Convert to uint32_t, lsb first. + size_t len = cu.length(); + while (len != 0) { + size_t trim = Min(len, size_t(8)); + len -= trim; + auto tmp = cu.substr(len, trim); + auto chunk = stoul(tmp, nullptr, 16); + mask.push_back(chunk); + } + + // Trim leading zeros + while (!mask.empty() && mask.back() == 0) mask.pop_back(); + + // Mask 0x0 is an error. 
+ if (mask.empty()) return; + + } else { + // parse cu lists + auto cu_indices = get_elements(parts[1]); + if (cu_indices.empty()) return; + uint32_t maxdword = cu_indices.back() / 32 + 1; + mask.resize(maxdword, 0); + for (auto id : cu_indices) { + uint32_t index, offset; + index = id / 32; + offset = id % 32; + mask[index] |= 1ul << offset; + } + } + + // parse device list + gpu_index = get_elements(parts[0]); + if (gpu_index.empty()) return; + + // Ensure that no GPU was repeated across cu_sets + for (auto id : gpu_index) { + if (cu_mask_.find(id) != cu_mask_.end()) return; + } + + // Insert into map + for (auto id : gpu_index) { + cu_mask_[id] = mask; + } + } +} + +} // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h index c7149363ad..b8d996c1b8 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h @@ -3,7 +3,7 @@ // The University of Illinois/NCSA // Open Source License (NCSA) // -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2014-2021, Advanced Micro Devices, Inc. All rights reserved. // // Developed by: // @@ -45,6 +45,8 @@ #include +#include +#include #include #include "core/util/os.h" @@ -149,6 +151,9 @@ class Flag { var = os::GetEnvVar("HSA_ENABLE_DEBUG"); debug_ = (var == "1") ? true : false; + + var = os::GetEnvVar("HSA_CU_MASK"); + parse_masks(var); } bool check_flat_scratch() const { return check_flat_scratch_; } @@ -206,6 +211,13 @@ class Flag { bool debug() const { return debug_; } + const std::vector& cu_mask(uint32_t gpu_index) const { + static const std::vector empty; + auto it = cu_mask_.find(gpu_index); + if (it == cu_mask_.end()) return empty; + return it->second; + } + private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -243,6 +255,11 @@ class Flag { // Indicates user preference for Xnack state. 
XNACK_REQUEST xnack_; + // Map GPU index post RVD to its default cu mask. + std::map> cu_mask_; + + void parse_masks(std::string& args); + DISALLOW_COPY_AND_ASSIGN(Flag); }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def index eb853410e5..16aa30b3e3 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def +++ b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def @@ -179,6 +179,7 @@ global: hsa_amd_async_function; hsa_amd_image_get_info_max_dim; hsa_amd_queue_cu_set_mask; + hsa_amd_queue_cu_get_mask; hsa_amd_memory_fill; hsa_amd_memory_async_copy; hsa_amd_memory_async_copy_rect; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h index 35dd21bfa5..451204f412 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -186,6 +186,7 @@ struct AmdExtTable { decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn; decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn; decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn; + decltype(hsa_amd_queue_cu_get_mask)* hsa_amd_queue_cu_get_mask_fn; }; // Table to export HSA Core Runtime Apis diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h index b1ba20f631..7c1fc16af2 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -191,6 +191,13 @@ enum { * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses. */ HSA_STATUS_ERROR_MEMORY_FAULT = 43, + + /** + * The CU mask was successfully set but the mask attempted to enable a CU + * which was disabled for the process. CUs disabled for the process remain + * disabled. 
+ */ + HSA_STATUS_CU_MASK_REDUCED = 44, }; /** @@ -780,31 +787,63 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent, void* value); /** - * @brief Set a CU affinity to specific queues within the process, this function - * call is "atomic". + * @brief Set a queue's CU affinity mask. + * + * @details Enables the queue to run on only selected CUs. The given mask is + * combined by bitwise AND with any device wide mask in HSA_CU_MASK before + * being applied. * * @param[in] queue A pointer to HSA queue. * - * @param[in] num_cu_mask_count Size of CUMask bit array passed in. + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. * * @param[in] cu_mask Bit-vector representing the CU mask. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * + * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed + * but the given mask attempted to enable a CU which was disabled by + * HSA_CU_MASK. CUs disabled by HSA_CU_MASK remain disabled. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not + * a multiple of 32 or @p cu_mask is NULL. + * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, + uint32_t num_cu_mask_count, + const uint32_t* cu_mask); + +/** + * @brief Retrieve a queue's CU affinity mask. + * + * @details Returns the first num_cu_mask_count bits of a queue's CU mask. + * Ensure that num_cu_mask_count is at least as large as + * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask. + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. + * + * @param[out] cu_mask Bit-vector representing the CU mask. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. * - * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not - * multiple of 32 or @p cu_mask is NULL. - * - * @retval ::HSA_STATUS_ERROR failed to call thunk api + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not + * a multiple of 32 or @p cu_mask is NULL. * */ -hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, - uint32_t num_cu_mask_count, - const uint32_t* cu_mask); +hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + uint32_t* cu_mask); /** * @brief Memory segments associated with a memory pool.