From 4c6ea88cf5ba489e12df0ddf5d2ca7db4fd6bbc8 Mon Sep 17 00:00:00 2001
From: Sean Keely <Sean.Keely@amd.com>
Date: Tue, 29 Jun 2021 18:03:05 -0500
Subject: [PATCH] Add HSA_CU_MASK

New environment variable HSA_CU_MASK allows users to
specify a cu mask to every queue allocated from any
GPU.  hsa_amd_queue_cu_set_mask is restricted from
escaping this mask.

A new API hsa_amd_queue_cu_get_mask is added to query
the current cu mask.

Change-Id: I846c03a5faaca9b95067c31db84b59cc9fce2f03


[ROCm/ROCR-Runtime commit: 4455250be1c7a4f5d8c87127b92375a4253e1738]
---
 .../runtime/hsa-runtime/CMakeLists.txt        |   1 +
 .../core/common/hsa_table_interface.cpp       |   6 +
 .../hsa-runtime/core/inc/amd_aql_queue.h      |  17 +-
 .../hsa-runtime/core/inc/amd_gpu_agent.h      |   9 +-
 .../runtime/hsa-runtime/core/inc/host_queue.h |   8 +-
 .../hsa-runtime/core/inc/hsa_ext_amd_impl.h   |   4 +
 .../hsa-runtime/core/inc/intercept_queue.h    |   5 +-
 .../runtime/hsa-runtime/core/inc/queue.h      |  12 +-
 .../core/runtime/amd_aql_queue.cpp            |  76 ++++++-
 .../core/runtime/amd_gpu_agent.cpp            |   4 +-
 .../hsa-runtime/core/runtime/amd_topology.cpp |   6 +-
 .../runtime/hsa-runtime/core/runtime/hsa.cpp  |   6 +
 .../core/runtime/hsa_api_trace.cpp            |   1 +
 .../hsa-runtime/core/runtime/hsa_ext_amd.cpp  |  21 +-
 .../runtime/hsa-runtime/core/util/flag.cpp    | 204 ++++++++++++++++++
 .../runtime/hsa-runtime/core/util/flag.h      |  19 +-
 .../runtime/hsa-runtime/hsacore.so.def        |   1 +
 .../runtime/hsa-runtime/inc/hsa_api_trace.h   |   1 +
 .../runtime/hsa-runtime/inc/hsa_ext_amd.h     |  59 ++++-
 19 files changed, 430 insertions(+), 30 deletions(-)
 create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.cpp

diff --git a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt
index ec0ccdf748..2a8937ce58 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt
+++ b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt
@@ -151,6 +151,7 @@ set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY LINK_FLAGS ${HSA_SHARED_LINK
 set ( SRCS core/util/lnx/os_linux.cpp
            core/util/small_heap.cpp
            core/util/timer.cpp
+           core/util/flag.cpp
            core/runtime/amd_blit_kernel.cpp
            core/runtime/amd_blit_sdma.cpp
            core/runtime/amd_cpu_agent.cpp
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp
index 0ff7b5f8b9..3159af3b4f 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp
@@ -926,6 +926,12 @@ hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
                                      queue, num_cu_mask_count, cu_mask);
 }
 
+// Mirrors Amd Extension Apis: forwards to the hsa_amd_queue_cu_get_mask_fn entry of amdExtTable.
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                               uint32_t* cu_mask) {
+  return amdExtTable->hsa_amd_queue_cu_get_mask_fn(queue, num_cu_mask_count, cu_mask);
+}
+
 // Mirrors Amd Extension Apis
 hsa_status_t HSA_API
     hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h
index 2ac6b3503d..0567731225 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h
@@ -185,7 +185,16 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
   /// @param cu_mask pointer to cu mask
   ///
   /// @return hsa_status_t
-  hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override;
+  hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override;
+
+  /// @brief Get CU Masking
+  ///
+  /// @param num_cu_mask_count size of mask bit array
+  ///
+  /// @param cu_mask pointer to cu mask
+  ///
+  /// @return hsa_status_t
+  hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override;
 
   // @brief Submits a block of PM4 and waits until it has been executed.
   void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override;
@@ -277,6 +286,12 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
   // Exception notification signal
   Signal* exception_signal_;
 
+  // CU mask lock
+  KernelMutex mask_lock_;
+
+  // Current CU mask
+  std::vector<uint32_t> cu_mask_;
+
   // Shared event used for queue errors
   static HsaEvent* queue_event_;
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 152a0e8df6..71b1d3fa44 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -174,7 +174,7 @@ class GpuAgent : public GpuAgentInt {
   // id.
   // @param [in] node_props Node property.
   // @param [in] xnack_mode XNACK mode of device.
-  GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode);
+  GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index);
 
   // @brief GPU agent destructor.
   ~GpuAgent();
@@ -322,6 +322,10 @@ class GpuAgent : public GpuAgentInt {
     return memory_max_frequency_;
   }
 
+  // @brief Order the device is surfaced in hsa_iterate_agents counting only
+  // GPU devices.
+  __forceinline uint32_t enumeration_index() const { return enum_index_; }
+
   void Trim() override;
 
  protected:
@@ -454,6 +458,9 @@ class GpuAgent : public GpuAgentInt {
   // @brief The GPU memory maximum frequency in MHz.
   uint32_t memory_max_frequency_;
 
+  // @brief Enumeration index
+  uint32_t enum_index_;
+
   // @brief HDP flush registers
   hsa_amd_hdp_flush_t HDP_flush_ = {nullptr, nullptr};
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h
index 3393ede73a..8521aed7b7 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h
@@ -144,8 +144,12 @@ class HostQueue : public Queue {
                        std::memory_order_release);
   }
 
-  hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
-    return HSA_STATUS_ERROR;
+  hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
+    return HSA_STATUS_ERROR_INVALID_QUEUE;
+  }
+
+  hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override {
+    return HSA_STATUS_ERROR_INVALID_QUEUE;
   }
 
   void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h
index 9954b8fc7a..9ea1b57e55 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h
@@ -113,6 +113,10 @@ hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
                                                uint32_t num_cu_mask_count,
                                                const uint32_t* cu_mask);
 
+// Mirrors Amd Extension Apis
+hsa_status_t hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                       uint32_t* cu_mask);
+
 // Mirrors Amd Extension Apis
 hsa_status_t
     hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h
index 4ab897fae9..209ee95405 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h
@@ -114,9 +114,12 @@ class QueueWrapper : public Queue {
   uint64_t AddWriteIndexRelease(uint64_t value) override {
     return wrapped->AddWriteIndexRelease(value);
   }
-  hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
+  hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
     return wrapped->SetCUMasking(num_cu_mask_count, cu_mask);
   }
+  hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override {
+    return wrapped->GetCUMasking(num_cu_mask_count, cu_mask);
+  }
   void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override {
     wrapped->ExecutePM4(cmd_data, cmd_size_b);
   }
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h
index 9553a170c9..71866ff8f0 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h
@@ -295,8 +295,16 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
   /// @param cu_mask pointer to cu mask
   ///
   /// @return hsa_status_t
-  virtual hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count,
-                                    const uint32_t* cu_mask) = 0;
+  virtual hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) = 0;
+
+  /// @brief Get CU Masking
+  ///
+  /// @param num_cu_mask_count size of mask bit array
+  ///
+  /// @param cu_mask pointer to cu mask
+  ///
+  /// @return hsa_status_t
+  virtual hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) = 0;
 
   // @brief Submits a block of PM4 and waits until it has been executed.
   virtual void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) = 0;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index 19d5cc622d..8e0e602e13 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -304,6 +304,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
     exceptionState = ERROR_HANDLER_DONE;
   }
 
+  // Allocate IB for icache flushes.
   pm4_ib_buf_ = core::Runtime::runtime_singleton_->system_allocator()(
       pm4_ib_size_b_, 0x1000, core::MemoryRegion::AllocateExecutable);
   if (pm4_ib_buf_ == nullptr)
@@ -313,6 +314,9 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
     core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_);
   });
 
+  // Set initial CU mask
+  SetCUMasking(0, nullptr);
+
   active_ = true;
 
   PM4IBGuard.Dismiss();
@@ -1027,12 +1031,72 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
   return false;
 }
 
-hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count,
-                                    const uint32_t* cu_mask) {
-  HSAKMT_STATUS ret = hsaKmtSetQueueCUMask(
-      queue_id_, num_cu_mask_count,
-      reinterpret_cast<HSAuint32*>(const_cast<uint32_t*>(cu_mask)));
-  return (HSAKMT_STATUS_SUCCESS == ret) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) {
+  uint32_t cu_count = 0;  // Stays 0 (empty mask) if the query below does not fill it in.
+  agent_->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
+  size_t mask_dwords = (cu_count + 31) / 32;
+  // Mask to trim the last uint32_t in cu_mask to the physical CU count (0 if no trim is needed).
+  uint32_t tail_mask = (1u << (cu_count % 32)) - 1;
+
+  auto global_mask = core::Runtime::runtime_singleton_->flag().cu_mask(agent_->enumeration_index());
+  std::vector<uint32_t> mask;
+
+  bool clipped = false;
+
+  // num_cu_mask_count = 0 resets the CU mask (all physical CUs requested).
+  if (num_cu_mask_count == 0) {
+    for (size_t i = 0; i < mask_dwords; i++) mask.push_back(-1);
+  } else {
+    for (uint32_t i = 0; i < num_cu_mask_count / 32; i++) mask.push_back(cu_mask[i]);
+  }
+
+  // Apply global mask (HSA_CU_MASK entry for this GPU, if any) to user mask
+  if (!global_mask.empty()) {
+    // Limit mask processing to smallest needed dword range
+    size_t limit = Min(global_mask.size(), mask.size(), mask_dwords);
+
+    // Requested CUs in dwords beyond the processed range get dropped; report that as clipped.
+    for (size_t i = limit; i < mask.size(); i++) {
+      if (mask[i] != 0) {
+        clipped = true;
+        break;
+      }
+    }
+
+    mask.resize(limit, 0);
+    for (size_t i = 0; i < limit; i++) {
+      clipped |= ((mask[i] & (~global_mask[i])) != 0);
+      mask[i] &= global_mask[i];
+    }
+  } else {
+    // Limit to physical CU range only
+    size_t limit = Min(mask.size(), mask_dwords);
+    mask.resize(limit, 0);
+  }
+
+  // Clip last dword to physical CU limit if necessary
+  if ((mask.size() == mask_dwords) && (tail_mask != 0)) mask[mask_dwords - 1] &= tail_mask;
+
+  // Apply mask and update current cu masking tracking.  cu_mask_ only changes on success.
+  ScopedAcquire<KernelMutex> lock(&mask_lock_);
+  HSAKMT_STATUS ret =
+      hsaKmtSetQueueCUMask(queue_id_, mask.size() * 32, reinterpret_cast<HSAuint32*>(mask.data()));
+  if (ret != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR;
+  cu_mask_ = std::move(mask);
+  return clipped ? (hsa_status_t)HSA_STATUS_CU_MASK_REDUCED : HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t AqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) {
+  ScopedAcquire<KernelMutex> lock(&mask_lock_);  // Held while cu_mask_ is read.
+  assert(!cu_mask_.empty() && "No current cu_mask!");
+
+  uint32_t user_dword_count = num_cu_mask_count / 32;  // Caller's size in whole dwords.
+  if (user_dword_count > cu_mask_.size()) {
+    memset(&cu_mask[cu_mask_.size()], 0, sizeof(uint32_t) * (user_dword_count - cu_mask_.size()));
+    user_dword_count = cu_mask_.size();  // Dwords past the stored mask were zeroed above.
+  }
+  memcpy(cu_mask, &cu_mask_[0], sizeof(uint32_t) * user_dword_count);
+  return HSA_STATUS_SUCCESS;
+}
 
 void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 84cf857e61..da7932d951 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -77,7 +77,8 @@ extern HsaApiTable hsa_internal_api_table_;
 } // namespace core
 
 namespace AMD {
-GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode)
+GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode,
+                   uint32_t index)
     : GpuAgentInt(node),
       properties_(node_props),
       current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
@@ -89,6 +90,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
       doorbell_queue_map_(NULL),
       memory_bus_width_(0),
       memory_max_frequency_(0),
+      enum_index_(index),
       ape1_base_(0),
       ape1_size_(0),
       scratch_cache_(
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp
index bd8e9362bd..001b58feb0 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp
@@ -121,7 +121,8 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac
       return nullptr;
   }
   try {
-    gpu = new GpuAgent(node_id, node_prop, xnack_mode);
+    gpu = new GpuAgent(node_id, node_prop, xnack_mode,
+                       core::Runtime::runtime_singleton_->gpu_agents().size());
 
     const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion().version;
 
@@ -146,7 +147,8 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac
       if (gpu->isa()->GetProcessorName() == "gfx908") {
         node_prop.Capability.ui32.SRAM_EDCSupport = 1;
         delete gpu;
-        gpu = new GpuAgent(node_id, node_prop, xnack_mode);
+        gpu = new GpuAgent(node_id, node_prop, xnack_mode,
+                           core::Runtime::runtime_singleton_->gpu_agents().size());
       }
     }
   } catch (const hsa_exception& e) {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp
index b2d1f6537c..cc2c8f7841 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp
@@ -2762,6 +2762,12 @@ hsa_status_t hsa_status_string(
       *status_string =
           "HSA_STATUS_ERROR_MEMORY_FAULT: Agent attempted to access an inaccessible address.";
       break;
+    case HSA_STATUS_CU_MASK_REDUCED:
+      *status_string =
+          "HSA_STATUS_CU_MASK_REDUCED: The CU mask was successfully set but the mask attempted to "
+          "enable a CU which was disabled for the process.  CUs disabled for the process remain "
+          "disabled.";
+      break;
     default:
       return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp
index 605ec15aec..ab6a1f31b9 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp
@@ -359,6 +359,7 @@ void HsaApiTable::UpdateAmdExts() {
   amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function;
   amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any;
   amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask;
+  amd_ext_api.hsa_amd_queue_cu_get_mask_fn = AMD::hsa_amd_queue_cu_get_mask;
   amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info;
   amd_ext_api.hsa_amd_agent_iterate_memory_pools_fn = AMD::hsa_amd_agent_iterate_memory_pools;
   amd_ext_api.hsa_amd_memory_pool_allocate_fn = AMD::hsa_amd_memory_pool_allocate;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
index 66d7ff7278..843a588ff0 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
@@ -535,19 +535,34 @@ hsa_status_t hsa_amd_async_function(void (*callback)(void* arg), void* arg) {
   CATCH;
 }
 
-hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
-                                               uint32_t num_cu_mask_count,
-                                               const uint32_t* cu_mask) {
+hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                       const uint32_t* cu_mask) {
   TRY;
   IS_OPEN();
   IS_BAD_PTR(cu_mask);
 
   core::Queue* cmd_queue = core::Queue::Convert(queue);
   IS_VALID(cmd_queue);
+  if ((num_cu_mask_count == 0) || (num_cu_mask_count % 32 != 0))
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   return cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask);
   CATCH;
 }
 
+hsa_status_t hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                       uint32_t* cu_mask) {
+  TRY;
+  IS_OPEN();
+  IS_BAD_PTR(cu_mask);
+
+  core::Queue* cmd_queue = core::Queue::Convert(queue);
+  IS_VALID(cmd_queue);
+  if ((num_cu_mask_count == 0) || (num_cu_mask_count % 32 != 0))
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;  // Bit count must be a non-zero multiple of 32.
+  return cmd_queue->GetCUMasking(num_cu_mask_count, cu_mask);  // Queue-type specific.
+  CATCH;
+}
+
 hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
                                  hsa_agent_t* agents, int num_agent,
                                  void** agent_ptr) {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.cpp
new file mode 100644
index 0000000000..8a8c6e6e00
--- /dev/null
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.cpp
@@ -0,0 +1,204 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2021-2021, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/util/flag.h"
+#include "core/util/utils.h"
+
+#include <vector>
+#include <map>
+#include <string>
+#include <algorithm>
+#include <locale>
+
+namespace rocr {
+
+// Split str at each sep, modifying str.  Tokens may be empty; a trailing sep adds no empty token.
+static std::vector<std::string> split(std::string& str, char sep) {
+  std::vector<std::string> ret;
+  while (!str.empty()) {
+    size_t pos = str.find(sep);
+    if (pos == std::string::npos) {
+      ret.push_back(str);  // No separator left: the remainder is the last token.
+      return ret;
+    }
+    ret.push_back(str.substr(0, pos));
+    str.erase(0, pos + 1);  // Drop the token and its separator.
+  }
+  return ret;
+};
+
+// Parse an "id,id-id,..." string into a sorted id list; any syntax error yields an empty list.
+static std::vector<uint32_t> get_elements(std::string& str) {
+  std::vector<uint32_t> ret;
+  // Error paths return {} explicitly rather than clearing ret on scope exit, which needs NRVO.
+
+  std::vector<std::string> ranges = split(str, ',');
+  for (auto& str : ranges) {
+    auto range = split(str, '-');
+    // failure, empty element (split yields no tokens, e.g. "1,,2") or too many -'s.
+    if (range.empty() || range.size() > 2) return {};
+
+    char* end;
+    uint32_t index = strtoul(range[0].c_str(), &end, 10);
+    // Invalid syntax - id's must be non-empty (rejects "-5") and base 10 digits only.
+    if (range[0].empty() || *end != '\0') return {};
+    ret.push_back(index);
+
+    if (range.size() == 2) {
+      uint32_t secondindex = strtoul(range[1].c_str(), &end, 10);
+      if (*end != '\0') return {};         // bad syntax
+      if (secondindex < index) return {};  // inverted range
+      for (uint32_t i = index + 1; i < secondindex + 1; i++) ret.push_back(i);
+    }
+  }
+
+  // Confirm no duplicate ids.
+  std::sort(ret.begin(), ret.end());
+  if (std::adjacent_find(ret.begin(), ret.end()) != ret.end()) return {};
+
+  // Good parse, keep result.  The list is sorted ascending, so back() is the
+  // largest id (parse_masks sizes its bitmask from it).
+  return ret;
+}
+
+/*
+Parse env var per the following syntax, all whitespace is ignored:
+
+ID = [0-9][0-9]*                         ex. base 10 numbers
+ID_list = (ID | ID-ID)[, (ID | ID-ID)]*  ex. 0,2-4,7
+GPU_list = ID_list                       ex. 0,2-4,7
+CU_list = 0x[0-F]* | ID_list             ex. 0x337F OR 0,2-4,7
+CU_Set = GPU_list : CU_list              ex. 0,2-4,7:0-15,32-47 OR 0,2-4,7:0x337F
+HSA_CU_MASK =  CU_Set [; CU_Set]*        ex. 0,2-4,7:0-15,32-47; 3-9:0x337F
+
+GPU indexes are taken post ROCM_VISIBLE_DEVICES reordering.
+Listed or bit set CUs will be enabled at queue creation on the associated GPU.
+All other CUs on the associated GPUs will be disabled.
+CU masks of unlisted GPUs are not restricted.
+
+Repeating a GPU or CU ID is a syntax error.
+Parsing stops at the first CU_Set that has a syntax error, that set and all
+following sets are ignored.
+Specifying a mask with no usable CUs (CU_list is 0x0) is a syntax error.
+Users should use ROCM_VISIBLE_DEVICES if they want to exclude use of a
+particular GPU.
+*/
+void Flag::parse_masks(std::string& var) {
+  if (var.empty()) return;
+
+  // Remove whitespace
+  auto end = std::remove_if(var.begin(), var.end(),
+                            [](char c) { return std::isspace<char>(c, std::locale::classic()); });
+  var.erase(end, var.end());
+
+  // Switch to uppercase (cast first: toupper is undefined for negative char values)
+  for (auto& c : var) c = toupper(static_cast<unsigned char>(c));
+
+  // Iterate over cu sets
+  auto sets = split(var, ';');
+  for (auto& set : sets) {
+    auto parts = split(set, ':');
+    if (parts.size() != 2) return;
+
+    // temp storage for cu_set parsing.
+    std::vector<uint32_t> gpu_index;
+    std::vector<uint32_t> mask;
+
+    // parse cu list first, check for bitmask format.  CU_list may be empty (e.g. "0::").
+    if ((parts[1].size() > 1) && (parts[1][1] == 'X')) {
+      // Confirm hex format and strip prefix
+      auto& cu = parts[1];
+      if (cu[0] != '0') return;
+      cu.erase(0, 2);
+
+      // Ensure all valid hex characters
+      for (auto& c : cu) {
+        if (!isxdigit(static_cast<unsigned char>(c))) return;
+      }
+
+      // Convert to uint32_t, lsb first (8 hex digits per dword, taken from the string's end).
+      size_t len = cu.length();
+      while (len != 0) {
+        size_t trim = Min(len, size_t(8));
+        len -= trim;
+        auto tmp = cu.substr(len, trim);
+        auto chunk = stoul(tmp, nullptr, 16);
+        mask.push_back(chunk);
+      }
+
+      // Trim leading zeros
+      while (!mask.empty() && mask.back() == 0) mask.pop_back();
+
+      // Mask 0x0 is an error.
+      if (mask.empty()) return;
+
+    } else {
+      // parse cu lists (an empty result means a syntax error or an empty list)
+      auto cu_indices = get_elements(parts[1]);
+      if (cu_indices.empty()) return;
+      uint32_t maxdword = cu_indices.back() / 32 + 1;
+      mask.resize(maxdword, 0);
+      for (auto id : cu_indices) {
+        uint32_t index, offset;
+        index = id / 32;
+        offset = id % 32;
+        mask[index] |= 1ul << offset;
+      }
+    }
+
+    // parse device list
+    gpu_index = get_elements(parts[0]);
+    if (gpu_index.empty()) return;
+
+    // Ensure that no GPU was repeated across cu_sets
+    for (auto id : gpu_index) {
+      if (cu_mask_.find(id) != cu_mask_.end()) return;
+    }
+
+    // Insert into map
+    for (auto id : gpu_index) {
+      cu_mask_[id] = mask;
+    }
+  }
+}
+
+}  // namespace rocr
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h
index c7149363ad..b8d996c1b8 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h
@@ -3,7 +3,7 @@
 // The University of Illinois/NCSA
 // Open Source License (NCSA)
 //
-// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2014-2021, Advanced Micro Devices, Inc. All rights reserved.
 //
 // Developed by:
 //
@@ -45,6 +45,8 @@
 
 #include <stdint.h>
 
+#include <vector>
+#include <map>
 #include <string>
 
 #include "core/util/os.h"
@@ -149,6 +151,9 @@ class Flag {
 
     var = os::GetEnvVar("HSA_ENABLE_DEBUG");
     debug_ = (var == "1") ? true : false;
+
+    var = os::GetEnvVar("HSA_CU_MASK");
+    parse_masks(var);
   }
 
   bool check_flat_scratch() const { return check_flat_scratch_; }
@@ -206,6 +211,13 @@ class Flag {
 
   bool debug() const { return debug_; }
 
+  const std::vector<uint32_t>& cu_mask(uint32_t gpu_index) const {
+    static const std::vector<uint32_t> empty;  // Result for GPUs without a HSA_CU_MASK entry.
+    auto it = cu_mask_.find(gpu_index);
+    if (it == cu_mask_.end()) return empty;
+    return it->second;
+  }
+
  private:
   bool check_flat_scratch_;
   bool enable_vm_fault_message_;
@@ -243,6 +255,11 @@ class Flag {
   // Indicates user preference for Xnack state.
   XNACK_REQUEST xnack_;
 
+  // Map GPU index post RVD to its default cu mask.
+  std::map<uint32_t, std::vector<uint32_t>> cu_mask_;
+
+  void parse_masks(std::string& args);
+
   DISALLOW_COPY_AND_ASSIGN(Flag);
 };
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def
index eb853410e5..16aa30b3e3 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def
+++ b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def
@@ -179,6 +179,7 @@ global:
 	hsa_amd_async_function;
 	hsa_amd_image_get_info_max_dim;
 	hsa_amd_queue_cu_set_mask;
+	hsa_amd_queue_cu_get_mask;
 	hsa_amd_memory_fill;
 	hsa_amd_memory_async_copy;
 	hsa_amd_memory_async_copy_rect;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h
index 35dd21bfa5..451204f412 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h
@@ -186,6 +186,7 @@ struct AmdExtTable {
   decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn;
   decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn;
   decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn;
+  decltype(hsa_amd_queue_cu_get_mask)* hsa_amd_queue_cu_get_mask_fn;
 };
 
 // Table to export HSA Core Runtime Apis
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h
index b1ba20f631..7c1fc16af2 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h
@@ -191,6 +191,13 @@ enum {
    * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses.
    */
   HSA_STATUS_ERROR_MEMORY_FAULT = 43,
+
+  /**
+   * The CU mask was successfully set but the mask attempted to enable a CU
+   * which was disabled for the process.  CUs disabled for the process remain
+   * disabled.
+   */
+  HSA_STATUS_CU_MASK_REDUCED = 44,
 };
 
 /**
@@ -780,31 +787,63 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
                                                     void* value);
 
 /**
- * @brief Set a CU affinity to specific queues within the process, this function
- * call is "atomic".
+ * @brief Set a queue's CU affinity mask.
+ *
+ * @details Enables the queue to run on only selected CUs.  The given mask is
+ * combined by bitwise AND with any device wide mask in HSA_CU_MASK before
+ * being applied.
  *
  * @param[in] queue A pointer to HSA queue.
  *
- * @param[in] num_cu_mask_count Size of CUMask bit array passed in.
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
  *
  * @param[in] cu_mask Bit-vector representing the CU mask.
  *
  * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
  *
+ * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed
+ * but the given mask attempted to enable a CU which was disabled by
+ * HSA_CU_MASK.  CUs disabled by HSA_CU_MASK remain disabled.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
+ * a multiple of 32 or @p cu_mask is NULL.
+ * @retval ::HSA_STATUS_ERROR The driver call that applies the CU mask failed.
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
+                                               uint32_t num_cu_mask_count,
+                                               const uint32_t* cu_mask);
+
+/**
+ * @brief Retrieve a queue's CU affinity mask.
+ *
+ * @details Returns the first num_cu_mask_count bits of a queue's CU mask.
+ * Ensure that num_cu_mask_count is at least as large as
+ * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask.
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
+ *
+ * @param[out] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
  * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
  * initialized.
  *
  * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
  *
- * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not
- * multiple of 32 or @p cu_mask is NULL.
- *
- * @retval ::HSA_STATUS_ERROR failed to call thunk api
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
+ * a multiple of 32 or @p cu_mask is NULL.
  *
  */
-hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
-                                               uint32_t num_cu_mask_count,
-                                               const uint32_t* cu_mask);
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                               uint32_t* cu_mask);
 
 /**
  * @brief Memory segments associated with a memory pool.