Add HSA_CU_MASK

New environment variable HSA_CU_MASK allows users to specify a cu mask to every queue allocated from any GPU. hsa_amd_queue_cu_set_mask is restricted from escaping this mask. A new API hsa_amd_queue_cu_get_mask is added to query the current cu mask. Change-Id: I846c03a5faaca9b95067c31db84b59cc9fce2f03 [ROCm/ROCR-Runtime commit: 4455250be1]
2021-06-29 18:03:05 -05:00
parent c7606d1dfc
commit 4c6ea88cf5
19 changed files with 430 additions and 30 deletions
@@ -151,6 +151,7 @@ set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY LINK_FLAGS ${HSA_SHARED_LINK
 set ( SRCS core/util/lnx/os_linux.cpp
           core/util/small_heap.cpp
           core/util/timer.cpp
+           core/util/flag.cpp
           core/runtime/amd_blit_kernel.cpp
           core/runtime/amd_blit_sdma.cpp
           core/runtime/amd_cpu_agent.cpp
@@ -926,6 +926,12 @@ hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
                                     queue, num_cu_mask_count, cu_mask);
 }

+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                               uint32_t* cu_mask) {
+  return amdExtTable->hsa_amd_queue_cu_get_mask_fn(queue, num_cu_mask_count, cu_mask);
+}
+
 // Mirrors Amd Extension Apis
 hsa_status_t HSA_API
    hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
@@ -185,7 +185,16 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
  /// @param cu_mask pointer to cu mask
  ///
  /// @return hsa_status_t
-  hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override;
+  hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override;
+
+  /// @brief Get CU Masking
+  ///
+  /// @param num_cu_mask_count size of mask bit array
+  ///
+  /// @param cu_mask pointer to cu mask
+  ///
+  /// @return hsa_status_t
+  hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override;

  // @brief Submits a block of PM4 and waits until it has been executed.
  void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override;
@@ -277,6 +286,12 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
  // Exception notification signal
  Signal* exception_signal_;

+  // CU mask lock
+  KernelMutex mask_lock_;
+
+  // Current CU mask
+  std::vector<uint32_t> cu_mask_;
+
  // Shared event used for queue errors
  static HsaEvent* queue_event_;

@@ -174,7 +174,7 @@ class GpuAgent : public GpuAgentInt {
  // id.
  // @param [in] node_props Node property.
  // @param [in] xnack_mode XNACK mode of device.
-  GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode);
+  GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index);

  // @brief GPU agent destructor.
  ~GpuAgent();
@@ -322,6 +322,10 @@ class GpuAgent : public GpuAgentInt {
    return memory_max_frequency_;
  }

+  // @brief Order the device is surfaced in hsa_iterate_agents counting only
+  // GPU devices.
+  __forceinline uint32_t enumeration_index() const { return enum_index_; }
+
  void Trim() override;

 protected:
@@ -454,6 +458,9 @@ class GpuAgent : public GpuAgentInt {
  // @brief The GPU memory maximum frequency in MHz.
  uint32_t memory_max_frequency_;

+  // @brief Enumeration index
+  uint32_t enum_index_;
+
  // @brief HDP flush registers
  hsa_amd_hdp_flush_t HDP_flush_ = {nullptr, nullptr};

@@ -144,8 +144,12 @@ class HostQueue : public Queue {
                       std::memory_order_release);
  }

-  hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
-    return HSA_STATUS_ERROR;
+  hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
+    return HSA_STATUS_ERROR_INVALID_QUEUE;
+  }
+
+  hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override {
+    return HSA_STATUS_ERROR_INVALID_QUEUE;
  }

  void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override {
@@ -113,6 +113,10 @@ hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
                                               uint32_t num_cu_mask_count,
                                               const uint32_t* cu_mask);

+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                               uint32_t* cu_mask);
+
 // Mirrors Amd Extension Apis
 hsa_status_t
    hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
@@ -114,9 +114,12 @@ class QueueWrapper : public Queue {
  uint64_t AddWriteIndexRelease(uint64_t value) override {
    return wrapped->AddWriteIndexRelease(value);
  }
-  hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
+  hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
    return wrapped->SetCUMasking(num_cu_mask_count, cu_mask);
  }
+  hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override {
+    return wrapped->GetCUMasking(num_cu_mask_count, cu_mask);
+  }
  void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override {
    wrapped->ExecutePM4(cmd_data, cmd_size_b);
  }
@@ -295,8 +295,16 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
  /// @param cu_mask pointer to cu mask
  ///
  /// @return hsa_status_t
-  virtual hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count,
-                                    const uint32_t* cu_mask) = 0;
+  virtual hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) = 0;
+
+  /// @brief Get CU Masking
+  ///
+  /// @param num_cu_mask_count size of mask bit array
+  ///
+  /// @param cu_mask pointer to cu mask
+  ///
+  /// @return hsa_status_t
+  virtual hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) = 0;

  // @brief Submits a block of PM4 and waits until it has been executed.
  virtual void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) = 0;
@@ -304,6 +304,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
    exceptionState = ERROR_HANDLER_DONE;
  }

+  // Allocate IB for icache flushes.
  pm4_ib_buf_ = core::Runtime::runtime_singleton_->system_allocator()(
      pm4_ib_size_b_, 0x1000, core::MemoryRegion::AllocateExecutable);
  if (pm4_ib_buf_ == nullptr)
@@ -313,6 +314,9 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
    core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_);
  });

+  // Set initial CU mask
+  SetCUMasking(0, nullptr);
+
  active_ = true;

  PM4IBGuard.Dismiss();
@@ -1027,12 +1031,72 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
  return false;
 }

-hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count,
-                                    const uint32_t* cu_mask) {
-  HSAKMT_STATUS ret = hsaKmtSetQueueCUMask(
-      queue_id_, num_cu_mask_count,
-      reinterpret_cast<HSAuint32*>(const_cast<uint32_t*>(cu_mask)));
-  return (HSAKMT_STATUS_SUCCESS == ret) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+// Applies a CU mask to this queue.
+//
+// num_cu_mask_count is the size of cu_mask in bits; 0 requests every CU.  The
+// requested mask is ANDed with the HSA_CU_MASK mask registered for this agent
+// (if any), trimmed to the agent's reported CU count, handed to the thunk and
+// remembered in cu_mask_ for GetCUMasking.
+//
+// Returns HSA_STATUS_CU_MASK_REDUCED when applying the HSA_CU_MASK restriction
+// dropped requested bits, HSA_STATUS_ERROR when the mask could not be applied
+// and HSA_STATUS_SUCCESS otherwise.
+hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) {
+  // Zero initialized so the count is never indeterminate if the query does not
+  // write it.
+  uint32_t cu_count = 0;
+  agent_->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
+  const size_t mask_dwords = (cu_count + 31) / 32;
+  // Mask to trim the last uint32_t in cu_mask to the physical CU count.
+  // Computed unsigned: (1 << 31) - 1 would overflow a signed int.
+  const uint32_t tail_mask = (1u << (cu_count % 32)) - 1;
+
+  // Bound by reference, the flag object owns the mask storage.
+  const auto& global_mask =
+      core::Runtime::runtime_singleton_->flag().cu_mask(agent_->enumeration_index());
+  std::vector<uint32_t> mask;
+
+  bool clipped = false;
+
+  // num_cu_mask_count = 0 resets the CU mask.
+  if (num_cu_mask_count == 0) {
+    mask.assign(mask_dwords, ~0u);
+  } else {
+    mask.assign(cu_mask, cu_mask + num_cu_mask_count / 32);
+  }
+
+  // Apply global mask to user mask
+  if (!global_mask.empty()) {
+    // Limit mask processing to smallest needed dword range
+    const size_t limit = Min(global_mask.size(), mask.size(), mask_dwords);
+
+    // Check for disabling requested cus.
+    for (size_t i = limit; i < mask.size(); i++) {
+      if (mask[i] != 0) {
+        clipped = true;
+        break;
+      }
+    }
+
+    mask.resize(limit, 0);
+    for (size_t i = 0; i < limit; i++) {
+      clipped |= ((mask[i] & (~global_mask[i])) != 0);
+      mask[i] &= global_mask[i];
+    }
+  } else {
+    // Limit to physical CU range only
+    const size_t limit = Min(mask.size(), mask_dwords);
+    mask.resize(limit, 0);
+  }
+
+  // Clip last dword to physical CU limit if necessary
+  if ((mask.size() == mask_dwords) && (tail_mask != 0)) mask[mask_dwords - 1] &= tail_mask;
+
+  // An empty mask has no dwords to pass on; fail here rather than take the
+  // address of an element of an empty vector.
+  if (mask.empty()) return HSA_STATUS_ERROR;
+
+  // Apply mask and update current cu masking tracking.
+  ScopedAcquire<KernelMutex> lock(&mask_lock_);
+  HSAKMT_STATUS ret =
+      hsaKmtSetQueueCUMask(queue_id_, mask.size() * 32, reinterpret_cast<HSAuint32*>(mask.data()));
+  if (ret != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR;
+  cu_mask_ = std::move(mask);
+  return clipped ? (hsa_status_t)HSA_STATUS_CU_MASK_REDUCED : HSA_STATUS_SUCCESS;
+}
+
+// Copies the CU mask currently tracked for this queue into cu_mask.
+//
+// num_cu_mask_count is the size of cu_mask in bits; only whole dwords
+// (num_cu_mask_count / 32) are written.  Dwords requested beyond the length of
+// the tracked mask are zero filled.  Always returns HSA_STATUS_SUCCESS.
+hsa_status_t AqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) {
+  // Held for the whole copy so the tracked mask cannot be replaced midway.
+  ScopedAcquire<KernelMutex> lock(&mask_lock_);
+  assert(!cu_mask_.empty() && "No current cu_mask!");
+
+  uint32_t user_dword_count = num_cu_mask_count / 32;
+  if (user_dword_count > cu_mask_.size()) {
+    // Caller's buffer is longer than the tracked mask: clear the excess dwords
+    // and copy only as many dwords as are tracked.
+    memset(&cu_mask[cu_mask_.size()], 0, sizeof(uint32_t) * (user_dword_count - cu_mask_.size()));
+    user_dword_count = cu_mask_.size();
+  }
+  memcpy(cu_mask, &cu_mask_[0], sizeof(uint32_t) * user_dword_count);
+  return HSA_STATUS_SUCCESS;
 }

 void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) {
@@ -77,7 +77,8 @@ extern HsaApiTable hsa_internal_api_table_;
 } // namespace core

 namespace AMD {
-GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode)
+GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode,
+                   uint32_t index)
    : GpuAgentInt(node),
      properties_(node_props),
      current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
@@ -89,6 +90,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
      doorbell_queue_map_(NULL),
      memory_bus_width_(0),
      memory_max_frequency_(0),
+      enum_index_(index),
      ape1_base_(0),
      ape1_size_(0),
      scratch_cache_(
@@ -121,7 +121,8 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac
      return nullptr;
  }
  try {
-    gpu = new GpuAgent(node_id, node_prop, xnack_mode);
+    gpu = new GpuAgent(node_id, node_prop, xnack_mode,
+                       core::Runtime::runtime_singleton_->gpu_agents().size());

    const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion().version;

@@ -146,7 +147,8 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac
      if (gpu->isa()->GetProcessorName() == "gfx908") {
        node_prop.Capability.ui32.SRAM_EDCSupport = 1;
        delete gpu;
-        gpu = new GpuAgent(node_id, node_prop, xnack_mode);
+        gpu = new GpuAgent(node_id, node_prop, xnack_mode,
+                           core::Runtime::runtime_singleton_->gpu_agents().size());
      }
    }
  } catch (const hsa_exception& e) {
@@ -2762,6 +2762,12 @@ hsa_status_t hsa_status_string(
      *status_string =
          "HSA_STATUS_ERROR_MEMORY_FAULT: Agent attempted to access an inaccessible address.";
      break;
+    case HSA_STATUS_CU_MASK_REDUCED:
+      *status_string =
+          "HSA_STATUS_CU_MASK_REDUCED: The CU mask was successfully set but the mask attempted to "
+          "enable a CU which was disabled for the process.  CUs disabled for the process remain "
+          "disabled.";
+      break;
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
@@ -359,6 +359,7 @@ void HsaApiTable::UpdateAmdExts() {
  amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function;
  amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any;
  amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask;
+  amd_ext_api.hsa_amd_queue_cu_get_mask_fn = AMD::hsa_amd_queue_cu_get_mask;
  amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info;
  amd_ext_api.hsa_amd_agent_iterate_memory_pools_fn = AMD::hsa_amd_agent_iterate_memory_pools;
  amd_ext_api.hsa_amd_memory_pool_allocate_fn = AMD::hsa_amd_memory_pool_allocate;
@@ -535,19 +535,34 @@ hsa_status_t hsa_amd_async_function(void (*callback)(void* arg), void* arg) {
  CATCH;
 }

-hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
-                                               uint32_t num_cu_mask_count,
-                                               const uint32_t* cu_mask) {
+hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                       const uint32_t* cu_mask) {
  TRY;
  IS_OPEN();
  IS_BAD_PTR(cu_mask);

  core::Queue* cmd_queue = core::Queue::Convert(queue);
  IS_VALID(cmd_queue);
+  if ((num_cu_mask_count == 0) || (num_cu_mask_count % 32 != 0))
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  return cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask);
  CATCH;
 }

+// Entry point for querying a queue's CU mask.
+//
+// Converts the public queue handle to the internal queue object and forwards
+// to its GetCUMasking.  num_cu_mask_count (in bits) must be non-zero and a
+// multiple of 32, otherwise HSA_STATUS_ERROR_INVALID_ARGUMENT is returned.
+// NOTE(review): IS_OPEN / IS_BAD_PTR / IS_VALID are defined elsewhere;
+// presumably they return early on an uninitialized runtime, a null cu_mask and
+// an invalid queue respectively - confirm against their definitions.
+hsa_status_t hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                       uint32_t* cu_mask) {
+  TRY;
+  IS_OPEN();
+  IS_BAD_PTR(cu_mask);
+
+  core::Queue* cmd_queue = core::Queue::Convert(queue);
+  IS_VALID(cmd_queue);
+  // The mask is exchanged in whole dwords, so partial dword sizes are rejected.
+  if ((num_cu_mask_count == 0) || (num_cu_mask_count % 32 != 0))
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  return cmd_queue->GetCUMasking(num_cu_mask_count, cu_mask);
+  CATCH;
+}
+
 hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
                                 hsa_agent_t* agents, int num_agent,
                                 void** agent_ptr) {
@@ -0,0 +1,204 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2021-2021, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/util/flag.h"
+#include "core/util/utils.h"
+
+#include <vector>
+#include <map>
+#include <string>
+#include <algorithm>
+#include <locale>
+
+namespace rocr {
+
+// Split str at each occurrence of sep and return the pieces in order.
+//
+// str is consumed while splitting: on return it holds only the final piece, or
+// is empty if the input ended with sep.  Empty pieces before or between
+// separators are kept, but no empty piece is reported after a trailing
+// separator, and an empty str yields no pieces at all.
+static std::vector<std::string> split(std::string& str, char sep) {
+  std::vector<std::string> ret;
+  while (!str.empty()) {
+    size_t pos = str.find(sep);
+    if (pos == std::string::npos) {
+      // No separator left: the remainder is the last piece.
+      ret.push_back(str);
+      return ret;
+    }
+    ret.push_back(str.substr(0, pos));
+    // Drop the piece just stored together with its separator.
+    str.erase(0, pos + 1);
+  }
+  return ret;
+};
+
+// Parse an "id,id-id,..." string into a sorted list of unique ids.
+//
+// Each element is either a single base 10 id or an inclusive range "lo-hi"
+// with lo <= hi.  Returns an empty list on any syntax error, including an
+// empty element, an empty or non-numeric id, an id that does not fit in
+// uint32_t, an inverted range or a repeated id.  str is consumed by the parse.
+static std::vector<uint32_t> get_elements(std::string& str) {
+  std::vector<uint32_t> ret;
+
+  // Parse one id: non-empty, base 10 digits only and representable as uint32_t.
+  auto parse_id = [](const std::string& text, uint32_t& id) {
+    if (text.empty() || text.find_first_not_of("0123456789") != std::string::npos) return false;
+    unsigned long value = strtoul(text.c_str(), nullptr, 10);
+    if (value > UINT32_MAX) return false;
+    id = static_cast<uint32_t>(value);
+    return true;
+  };
+
+  // Errors return a fresh empty vector explicitly instead of clearing ret from
+  // a scope guard, which only affects the returned value if the compiler
+  // elides the copy of ret.
+  std::vector<std::string> ranges = split(str, ',');
+  for (auto& element : ranges) {
+    auto range = split(element, '-');
+    // failure, empty element (e.g. "1,,2") or too many -'s.
+    if (range.empty() || range.size() > 2) return {};
+
+    uint32_t index;
+    // Invalid syntax - id's must be base 10 digits only.
+    if (!parse_id(range[0], index)) return {};
+    ret.push_back(index);
+
+    if (range.size() == 2) {
+      uint32_t secondindex;
+      if (!parse_id(range[1], secondindex)) return {};  // bad syntax
+      if (secondindex < index) return {};               // inverted range
+      // Increment before the push so the bound never needs secondindex + 1,
+      // which would wrap to 0 for UINT32_MAX.
+      for (uint32_t i = index; i != secondindex;) ret.push_back(++i);
+    }
+  }
+
+  // Confirm no duplicate ids.
+  std::sort(ret.begin(), ret.end());
+  if (std::adjacent_find(ret.begin(), ret.end()) != ret.end()) return {};
+
+  // Good parse, keep result.
+  return ret;
+}
+
+/*
+Parse env var per the following syntax, all whitespace is ignored:
+
+ID = [0-9][0-9]*                         ex. base 10 numbers
+ID_list = (ID | ID-ID)[, (ID | ID-ID)]*  ex. 0,2-4,7
+GPU_list = ID_list                       ex. 0,2-4,7
+CU_list = 0x[0-F]* | ID_list             ex. 0x337F OR 0,2-4,7
+CU_Set = GPU_list : CU_list              ex. 0,2-4,7:0-15,32-47 OR 0,2-4,7:0x337F
+HSA_CU_MASK =  CU_Set [; CU_Set]*        ex. 0,2-4,7:0-15,32-47; 3-9:0x337F
+
+GPU indexes are taken post ROCM_VISIBLE_DEVICES reordering.
+Listed or bit set CUs will be enabled at queue creation on the associated GPU.
+All other CUs on the associated GPUs will be disabled.
+CU masks of unlisted GPUs are not restricted.
+
+Repeating a GPU or CU ID is a syntax error.
+Parsing stops at the first CU_Set that has a syntax error, that set and all
+following sets are ignored.
+Specifying a mask with no usable CUs (CU_list is 0x0) is a syntax error.
+Users should use ROCM_VISIBLE_DEVICES if they want to exclude use of a
+particular GPU.
+*/
+// Parse the HSA_CU_MASK value held in var and record one default CU mask per
+// listed GPU index in cu_mask_.  var is consumed by the parse.  Parsing stops
+// at the first malformed CU_Set; sets accepted before it are kept.
+void Flag::parse_masks(std::string& var) {
+  if (var.empty()) return;
+
+  // Remove whitespace
+  auto end = std::remove_if(var.begin(), var.end(),
+                            [](char c) { return std::isspace<char>(c, std::locale::classic()); });
+  var.erase(end, var.end());
+
+  // Switch to uppercase.  The character classification functions require a
+  // value representable as unsigned char, so convert before each call.
+  for (auto& c : var) c = static_cast<char>(toupper(static_cast<unsigned char>(c)));
+
+  // Iterate over cu sets
+  auto sets = split(var, ';');
+  for (auto& set : sets) {
+    auto parts = split(set, ':');
+    if (parts.size() != 2) return;
+
+    // temp storage for cu_set parsing.
+    std::vector<uint32_t> gpu_index;
+    std::vector<uint32_t> mask;
+
+    // parse cu list first, check for bitmask format.  The length check keeps
+    // the prefix test inside the string when the cu list is empty.
+    auto& cu = parts[1];
+    if (cu.size() > 1 && cu[1] == 'X') {
+      // Confirm hex format and strip prefix
+      if (cu[0] != '0') return;
+      cu.erase(0, 2);
+
+      // Ensure all valid hex characters
+      for (auto& c : cu) {
+        if (!isxdigit(static_cast<unsigned char>(c))) return;
+      }
+
+      // Convert to uint32_t, lsb first.  Each chunk is at most 8 hex digits so
+      // it always fits in 32 bits.
+      size_t len = cu.length();
+      while (len != 0) {
+        size_t trim = Min(len, size_t(8));
+        len -= trim;
+        auto tmp = cu.substr(len, trim);
+        mask.push_back(static_cast<uint32_t>(std::stoul(tmp, nullptr, 16)));
+      }
+
+      // Trim leading zeros
+      while (!mask.empty() && mask.back() == 0) mask.pop_back();
+
+      // Mask 0x0 is an error.
+      if (mask.empty()) return;
+
+    } else {
+      // parse cu lists
+      auto cu_indices = get_elements(cu);
+      if (cu_indices.empty()) return;
+      // Indices come back sorted, so the last one sizes the mask.
+      uint32_t maxdword = cu_indices.back() / 32 + 1;
+      mask.resize(maxdword, 0);
+      for (auto id : cu_indices) {
+        mask[id / 32] |= uint32_t(1) << (id % 32);
+      }
+    }
+
+    // parse device list
+    gpu_index = get_elements(parts[0]);
+    if (gpu_index.empty()) return;
+
+    // Ensure that no GPU was repeated across cu_sets
+    for (auto id : gpu_index) {
+      if (cu_mask_.find(id) != cu_mask_.end()) return;
+    }
+
+    // Insert into map
+    for (auto id : gpu_index) {
+      cu_mask_[id] = mask;
+    }
+  }
+}
+
+}  // namespace rocr
@@ -3,7 +3,7 @@
 // The University of Illinois/NCSA
 // Open Source License (NCSA)
 //
-// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2014-2021, Advanced Micro Devices, Inc. All rights reserved.
 //
 // Developed by:
 //
@@ -45,6 +45,8 @@

 #include <stdint.h>

+#include <vector>
+#include <map>
 #include <string>

 #include "core/util/os.h"
@@ -149,6 +151,9 @@ class Flag {

    var = os::GetEnvVar("HSA_ENABLE_DEBUG");
    debug_ = (var == "1") ? true : false;
+
+    var = os::GetEnvVar("HSA_CU_MASK");
+    parse_masks(var);
  }

  bool check_flat_scratch() const { return check_flat_scratch_; }
@@ -206,6 +211,13 @@ class Flag {

  bool debug() const { return debug_; }

+  const std::vector<uint32_t>& cu_mask(uint32_t gpu_index) const {
+    static const std::vector<uint32_t> empty;
+    auto it = cu_mask_.find(gpu_index);
+    if (it == cu_mask_.end()) return empty;
+    return it->second;
+  }
+
 private:
  bool check_flat_scratch_;
  bool enable_vm_fault_message_;
@@ -243,6 +255,11 @@ class Flag {
  // Indicates user preference for Xnack state.
  XNACK_REQUEST xnack_;

+  // Map GPU index post RVD to its default cu mask.
+  std::map<uint32_t, std::vector<uint32_t>> cu_mask_;
+
+  void parse_masks(std::string& args);
+
  DISALLOW_COPY_AND_ASSIGN(Flag);
 };

@@ -179,6 +179,7 @@ global:
 	hsa_amd_async_function;
 	hsa_amd_image_get_info_max_dim;
 	hsa_amd_queue_cu_set_mask;
+	hsa_amd_queue_cu_get_mask;
 	hsa_amd_memory_fill;
 	hsa_amd_memory_async_copy;
 	hsa_amd_memory_async_copy_rect;
@@ -186,6 +186,7 @@ struct AmdExtTable {
  decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn;
  decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn;
  decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn;
+  decltype(hsa_amd_queue_cu_get_mask)* hsa_amd_queue_cu_get_mask_fn;
 };

 // Table to export HSA Core Runtime Apis
@@ -191,6 +191,13 @@ enum {
   * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses.
   */
  HSA_STATUS_ERROR_MEMORY_FAULT = 43,
+
+  /**
+   * The CU mask was successfully set but the mask attempted to enable a CU
+   * which was disabled for the process.  CUs disabled for the process remain
+   * disabled.
+   */
+  HSA_STATUS_CU_MASK_REDUCED = 44,
 };

 /**
@@ -780,31 +787,63 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
                                                    void* value);

 /**
- * @brief Set a CU affinity to specific queues within the process, this function
- * call is "atomic".
+ * @brief Set a queue's CU affinity mask.
+ *
+ * @details Enables the queue to run on only selected CUs.  The given mask is
+ * combined by bitwise AND with any device wide mask in HSA_CU_MASK before
+ * being applied.
 *
 * @param[in] queue A pointer to HSA queue.
 *
- * @param[in] num_cu_mask_count Size of CUMask bit array passed in.
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
 *
 * @param[in] cu_mask Bit-vector representing the CU mask.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
+ * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed
+ * but the given mask attempted to enable a CU which was disabled by
+ * HSA_CU_MASK.  CUs disabled by HSA_CU_MASK remain disabled.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
+ * a multiple of 32 or @p cu_mask is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
+                                               uint32_t num_cu_mask_count,
+                                               const uint32_t* cu_mask);
+
+/**
+ * @brief Retrieve a queue's CU affinity mask.
+ *
+ * @details Returns the first num_cu_mask_count bits of a queue's CU mask.
+ * Ensure that num_cu_mask_count is at least as large as
+ * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask.
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
+ *
+ * @param[out] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
 *
- * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not
- * multiple of 32 or @p cu_mask is NULL.
- *
- * @retval ::HSA_STATUS_ERROR failed to call thunk api
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
+ * a multiple of 32 or @p cu_mask is NULL.
 *
 */
-hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
-                                               uint32_t num_cu_mask_count,
-                                               const uint32_t* cu_mask);
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                               uint32_t* cu_mask);

 /**
 * @brief Memory segments associated with a memory pool.