Squash merge of cfreehil/amd-temp-gfx90a onto amd-staging.

Includes some workarounds and HMM (Heterogeneous Memory Management) support. Conflicts: opensrc/hsa-runtime/core/runtime/amd_topology.cpp opensrc/hsa-runtime/core/util/flag.h Change-Id: I22976f07964a43dbb228a6231777dbd599112b8d [ROCm/ROCR-Runtime commit: 7333c77e22]
2021-04-01 20:51:51 -05:00
@@ -186,19 +186,19 @@ void QueueValidation::QueueValidationForInvalidDimension(hsa_agent_t cpuAgent,

  hsa_queue_t *queue[kMaxQueue];  // command queue
  uint32_t ii;
+  test_validation_data user_data[kMaxQueue];
  for (ii = 0; ii < kMaxQueue; ++ii) {
-    test_validation_data user_data;
    // set callback flag to false if callback called then it will change to true
-    user_data.cb_triggered = false;
+    user_data[ii].cb_triggered = false;
    // set the queue pointer
-    user_data.queue_pointer = &queue[ii];
+    user_data[ii].queue_pointer = &queue[ii];
    // set the expected status in queue error calback handling
-    user_data.expected_status = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
+    user_data[ii].expected_status = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;

    // create queue
    err = hsa_queue_create(gpuAgent,
-                           queue_max, HSA_QUEUE_TYPE_SINGLE,
-                           CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
+                       queue_max, HSA_QUEUE_TYPE_SINGLE,
+                       CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
    ASSERT_EQ(err, HSA_STATUS_SUCCESS);


@@ -240,12 +240,12 @@ void QueueValidation::QueueValidationForInvalidDimension(hsa_agent_t cpuAgent,
    // completion signal should not be changed.
    ASSERT_EQ(completion, 1);

-    // queue error handling callback  should be triggered
-    ASSERT_EQ(user_data.cb_triggered, true);
-
    hsa_signal_store_relaxed(aql().completion_signal, 1);
  }
+  sleep(1);
  for (ii = 0; ii < kMaxQueue; ++ii) {
+    // queue error handling callback  should be triggered
+    ASSERT_EQ(user_data[ii].cb_triggered, true);
    if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
  }
 }
@@ -269,20 +269,21 @@ void QueueValidation::QueueValidationInvalidGroupMemory(hsa_agent_t cpuAgent,
  queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent;

  hsa_queue_t *queue[kMaxQueue];  // command queue
+  test_validation_data user_data[kMaxQueue];
+
  uint32_t ii;
  for (ii = 0; ii < kMaxQueue; ++ii) {
-    test_validation_data user_data;
    // set callback flag to false if callback called then it will change to true
-    user_data.cb_triggered = false;
+    user_data[ii].cb_triggered = false;
    // set the queue pointer
-    user_data.queue_pointer = &queue[ii];
+    user_data[ii].queue_pointer = &queue[ii];
    // set the expected status in queue error calback handling
-    user_data.expected_status = HSA_STATUS_ERROR_INVALID_ALLOCATION;
+    user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_ALLOCATION;

    // create queue
    err = hsa_queue_create(gpuAgent,
-                           queue_max, HSA_QUEUE_TYPE_SINGLE,
-                           CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
+                       queue_max, HSA_QUEUE_TYPE_SINGLE,
+                       CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
    ASSERT_EQ(err, HSA_STATUS_SUCCESS);


@@ -325,12 +326,12 @@ void QueueValidation::QueueValidationInvalidGroupMemory(hsa_agent_t cpuAgent,
    // completion signal should not be changed.
    ASSERT_EQ(completion, 1);

-    // queue error handling callback  should be triggered
-    ASSERT_EQ(user_data.cb_triggered, true);
-
    hsa_signal_store_relaxed(aql().completion_signal, 1);
  }
+  sleep(1);
  for (ii = 0; ii < kMaxQueue; ++ii) {
+    // queue error handling callback  should be triggered
+    ASSERT_EQ(user_data[ii].cb_triggered, true);
    if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
  }
 }
@@ -353,20 +354,20 @@ void QueueValidation::QueueValidationForInvalidKernelObject(hsa_agent_t cpuAgent
  queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent;

  hsa_queue_t *queue[kMaxQueue];  // command queue
+  test_validation_data user_data[kMaxQueue];
  uint32_t ii;
  for (ii = 0; ii < kMaxQueue; ++ii) {
-    test_validation_data user_data;
    // set callback flag to false if callback called then it will change to true
-    user_data.cb_triggered = false;
+    user_data[ii].cb_triggered = false;
    // set the queue pointer
-    user_data.queue_pointer = &queue[ii];
+    user_data[ii].queue_pointer = &queue[ii];
    // set the expected status in queue error calback handling
-    user_data.expected_status = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;

    // create queue
    err = hsa_queue_create(gpuAgent,
                           kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE,
-                           CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
+                           CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
    ASSERT_EQ(err, HSA_STATUS_SUCCESS);


@@ -408,12 +409,12 @@ void QueueValidation::QueueValidationForInvalidKernelObject(hsa_agent_t cpuAgent
    // completion signal should not be changed.
    ASSERT_EQ(completion, 1);

-    // queue error handling callback  should be triggered
-    ASSERT_EQ(user_data.cb_triggered, true);
-
    hsa_signal_store_relaxed(aql().completion_signal, 1);
  }
+  sleep(1);
  for (ii = 0; ii < kMaxQueue; ++ii) {
+    // queue error handling callback  should be triggered
+    ASSERT_EQ(user_data[ii].cb_triggered, true);
    if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
  }
 }
@@ -437,19 +438,19 @@ void QueueValidation::QueueValidationForInvalidPacket(hsa_agent_t cpuAgent,

  hsa_queue_t *queue[kMaxQueue];  // command queue
  uint32_t ii;
+  test_validation_data user_data[kMaxQueue];
  for (ii = 0; ii < kMaxQueue; ++ii) {
-    test_validation_data user_data;
    // set callback flag to false if callback called then it will change to true
-    user_data.cb_triggered = false;
+    user_data[ii].cb_triggered = false;
    // set the queue pointer
-    user_data.queue_pointer = &queue[ii];
+    user_data[ii].queue_pointer = &queue[ii];
    // set the expected status in queue error calback handling
-    user_data.expected_status = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
+    user_data[ii].expected_status = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;

    // create queue
    err = hsa_queue_create(gpuAgent,
-                           queue_max, HSA_QUEUE_TYPE_SINGLE,
-                           CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
+                       queue_max, HSA_QUEUE_TYPE_SINGLE,
+                       CallbackQueueErrorHandling, &user_data[ii], 0, 0, &queue[ii]);
    ASSERT_EQ(err, HSA_STATUS_SUCCESS);


@@ -486,12 +487,12 @@ void QueueValidation::QueueValidationForInvalidPacket(hsa_agent_t cpuAgent,
    // completion signal should not be changed.
    ASSERT_EQ(completion, 1);

-    // queue error handling callback  should be triggered
-    ASSERT_EQ(user_data.cb_triggered, true);
-
    hsa_signal_store_relaxed(aql().completion_signal, 1);
  }
+  sleep(1);
  for (ii = 0; ii < kMaxQueue; ++ii) {
+    // queue error handling callback  should be triggered
+    ASSERT_EQ(user_data[ii].cb_triggered, true);
    if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
  }
 }
@@ -514,22 +515,22 @@ void QueueValidation::QueueValidationForInvalidWorkGroupSize(hsa_agent_t cpuAgen
  queue_max = (queue_max < kMaxQueueSizeForAgent) ? queue_max: kMaxQueueSizeForAgent;

  hsa_queue_t *queue[kMaxQueue];  // command queue
+  test_validation_data user_data[kMaxQueue][3];
  uint32_t ii;
  for (ii = 0; ii < kMaxQueue; ++ii) {
    uint32_t jj;
    for (jj = 1; jj <= 3; ++jj) {
-      test_validation_data user_data;
      // set callback flag to false if callback called then it will change to true
-      user_data.cb_triggered = false;
+      user_data[ii][jj - 1].cb_triggered = false;
      // set the queue pointer
-      user_data.queue_pointer = &queue[ii];
+      user_data[ii][jj - 1].queue_pointer = &queue[ii];
      // set the expected status in queue error calback handling
-      user_data.expected_status = HSA_STATUS_ERROR_INVALID_ARGUMENT;
+      user_data[ii][jj - 1].expected_status = HSA_STATUS_ERROR_INVALID_ARGUMENT;

      // create queue
      err = hsa_queue_create(gpuAgent,
-                             kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE,
-                             CallbackQueueErrorHandling, &user_data, 0, 0, &queue[ii]);
+              kMaxQueueSizeForAgent, HSA_QUEUE_TYPE_SINGLE,
+              CallbackQueueErrorHandling, &user_data[ii][jj - 1], 0, 0, &queue[ii]);
      ASSERT_EQ(err, HSA_STATUS_SUCCESS);


@@ -574,13 +575,17 @@ void QueueValidation::QueueValidationForInvalidWorkGroupSize(hsa_agent_t cpuAgen
      // completion signal should not be changed.
      ASSERT_EQ(completion, 1);

-      // queue error handling callback  should be triggered
-      ASSERT_EQ(user_data.cb_triggered, true);
-
      hsa_signal_store_relaxed(aql().completion_signal, 1);
      if (queue[ii]) { hsa_queue_destroy(queue[ii]); }
    }
  }
+  sleep(1);
+  for (uint32_t ii = 0; ii < kMaxQueue; ++ii) {
+    for (uint32_t jj = 0; jj < 3; ++jj) {
+      // queue error handling callback  should be triggered
+      ASSERT_EQ(user_data[ii][jj].cb_triggered, true);
+    }
+  }
 }


@@ -54,7 +54,7 @@ set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCR-R
 set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/RadeonOpenCompute/ROCR-Runtime" )


-set(DEFAULT_TARGETS "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
+set(DEFAULT_TARGETS "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")

 #
 # Currently support for Windows platform is not present
@@ -128,11 +128,6 @@ TEST(rocrtst, Test_Example) {
  RunGenericTest(&tst);
 }

-TEST(rocrtstFunc, IPC) {
-  IPCTest ipc;
-  RunGenericTest(&ipc);
-}
-
 TEST(rocrtstFunc, MemoryAccessTests) {
  MemoryAccessTest mt;
  RunCustomTestProlog(&mt);
@@ -175,7 +170,6 @@ TEST(rocrtstFunc, Concurrent_Shutdown) {
  RunCustomTestEpilog(&cs);
 }

-
 TEST(rocrtstFunc, Reference_Count) {
  ReferenceCountTest rc(true, false);
  RunCustomTestProlog(&rc);
@@ -212,6 +206,11 @@ TEST(rocrtstFunc, Signal_Create_Concurrently) {
 }

 #ifndef ROCRTST_EMULATOR_BUILD
+TEST(rocrtstFunc, IPC) {  // now declared under #ifndef ROCRTST_EMULATOR_BUILD
+  IPCTest ipc_test;
+  RunGenericTest(&ipc_test);
+}
+
 TEST(rocrtstFunc, DISABLED_Signal_Kernel_Set) {
  SignalKernelTest sk(SET);
  RunCustomTestProlog(&sk);
@@ -1151,6 +1151,27 @@ hsa_status_t HSA_API hsa_amd_signal_value_pointer(hsa_signal_t signal,
  return amdExtTable->hsa_amd_signal_value_pointer_fn(signal, value_ptr);
 }

+// Mirrors Amd Extension Apis: forwards to amdExtTable->hsa_amd_svm_attributes_set_fn
+hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count) {
+  return amdExtTable->hsa_amd_svm_attributes_set_fn(ptr, size, attribute_list, attribute_count);
+}
+
+// Mirrors Amd Extension Apis: forwards to amdExtTable->hsa_amd_svm_attributes_get_fn
+hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count) {
+  return amdExtTable->hsa_amd_svm_attributes_get_fn(ptr, size, attribute_list, attribute_count);
+}
+
+// Mirrors Amd Extension Apis: forwards to amdExtTable->hsa_amd_svm_prefetch_async_fn
+hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                        hsa_signal_t completion_signal) {
+  return amdExtTable->hsa_amd_svm_prefetch_async_fn(ptr, size, agent, num_dep_signals, dep_signals, completion_signal);
+}
+
 // Tools only table interfaces.
 namespace rocr {

@@ -173,7 +173,8 @@ class GpuAgent : public GpuAgentInt {
  // @param [in] node Node id. Each CPU in different socket will get distinct
  // id.
  // @param [in] node_props Node property.
-  GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props);
+  // @param [in] xnack_mode XNACK mode of device.
+  GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode);

  // @brief GPU agent destructor.
  ~GpuAgent();
@@ -439,6 +439,32 @@ static const unsigned int kCodeTrapHandler9[] = {
    0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c,
 };

+static const unsigned int kCodeTrapHandler90a[] = {
+    0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73,
+    0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900,
+    0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a,
+    0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff,
+    0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001,
+    0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f,
+    0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f,
+    0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f,
+    0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80,
+    0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044,
+    0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100,
+    0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4,
+    0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e,
+    0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c,
+    0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff,
+    0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f,
+    0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f,
+    0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37,
+    0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010,
+    0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f,
+    0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001,
+    0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e,
+    0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c,
+};
+
 static const unsigned int kCodeCopyAligned8[] = {
    0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020,
    0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050,
@@ -95,7 +95,7 @@ class MemoryRegion : public core::MemoryRegion {
  /// @brief Unpin memory.
  static void MakeKfdMemoryUnresident(const void* ptr);

-  MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
+  MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner,
               const HsaMemoryProperties& mem_props);

  ~MemoryRegion();
@@ -246,6 +246,21 @@ hsa_status_t hsa_amd_deregister_deallocation_callback(
 hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal,
                                          volatile hsa_signal_value_t** value_ptr);

+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                        hsa_signal_t completion_signal);
+
 }  // namespace amd
 }  // namespace rocr

@@ -57,8 +57,8 @@ class Agent;

 class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
 public:
-  MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner)
-      : fine_grain_(fine_grain), full_profile_(full_profile), owner_(owner) {
+  MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner)
+      : fine_grain_(fine_grain), kernarg_(kernarg), full_profile_(full_profile), owner_(owner) {
    assert(owner_ != NULL);
  }

@@ -112,12 +112,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {

  __forceinline bool fine_grain() const { return fine_grain_; }

+  __forceinline bool kernarg() const { return kernarg_; }
+
  __forceinline bool full_profile() const { return full_profile_; }

  __forceinline core::Agent* owner() const { return owner_; }

 private:
  const bool fine_grain_;
+  const bool kernarg_;
  const bool full_profile_;

  core::Agent* owner_;
@@ -292,6 +292,15 @@ class Runtime {

  hsa_status_t IPCDetach(void* ptr);

+  hsa_status_t SetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
+                            size_t attribute_count);
+
+  hsa_status_t GetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
+                            size_t attribute_count);
+
+  hsa_status_t SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, uint32_t num_dep_signals,
+                           const hsa_signal_t* dep_signals, hsa_signal_t completion_signal);
+
  const std::vector<Agent*>& cpu_agents() { return cpu_agents_; }

  const std::vector<Agent*>& gpu_agents() { return gpu_agents_; }
@@ -395,6 +404,28 @@ class Runtime {
    std::vector<void*> arg_;
  };

+  struct PrefetchRange;
+  typedef std::map<uintptr_t, PrefetchRange> prefetch_map_t;
+
+  struct PrefetchOp {  // NOTE(review): appears to record one Runtime::SvmPrefetch request; confirm
+    void* base;
+    size_t size;
+    uint32_t node_id;
+    int remaining_deps;  // NOTE(review): presumably dep_signals not yet satisfied; confirm
+    hsa_signal_t completion;
+    std::vector<hsa_signal_t> dep_signals;
+    prefetch_map_t::iterator prefetch_map_entry;  // iterator into a prefetch_map_t (uintptr_t keys)
+  };
+
+  struct PrefetchRange {  // Mapped value type of prefetch_map_t (keyed by uintptr_t).
+    PrefetchRange() : bytes(0), op(nullptr) {}  // no indeterminate members after default init
+    PrefetchRange(size_t Bytes, PrefetchOp* Op) : bytes(Bytes), op(Op) {}
+    size_t bytes;    // byte count supplied to the two-argument constructor, 0 by default
+    PrefetchOp* op;  // operation supplied to the constructor, nullptr by default
+    prefetch_map_t::iterator prev;  // NOTE(review): presumably links to neighbouring entries
+    prefetch_map_t::iterator next;  // of the same op inside the prefetch map; confirm
+  };
+
  // Will be created before any user could call hsa_init but also could be
  // destroyed before incorrectly written programs call hsa_shutdown.
  static KernelMutex bootstrap_lock_;
@@ -444,6 +475,9 @@ class Runtime {
  /// @retval Index in ::link_matrix_.
  uint32_t GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to);

+  /// @brief Get most recently issued SVM prefetch agent for the range in question.
+  Agent* GetSVMPrefetchAgent(void* ptr, size_t size);
+
  // Mutex object to protect multithreaded access to ::allocation_map_,
  // KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo
  // registered & mapped arrays.
@@ -485,6 +519,10 @@ class Runtime {
  // Contains the region, address, and size of previously allocated memory.
  std::map<const void*, AllocationRegion> allocation_map_;

+  // Pending prefetch containers.
+  KernelMutex prefetch_lock_;
+  prefetch_map_t prefetch_map_;
+
  // Allocator using ::system_region_
  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)> system_allocator_;

@@ -69,40 +69,31 @@ void CpuAgent::InitRegionList() {

  std::vector<HsaMemoryProperties> mem_props(properties_.NumMemoryBanks);
  if (HSAKMT_STATUS_SUCCESS ==
-      hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks,
-                                    &mem_props[0])) {
+      hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, &mem_props[0])) {
    std::vector<HsaMemoryProperties>::iterator system_prop =
        std::find_if(mem_props.begin(), mem_props.end(), [](HsaMemoryProperties prop) -> bool {
          return (prop.SizeInBytes > 0 && prop.HeapType == HSA_HEAPTYPE_SYSTEM);
        });

-    if (system_prop != mem_props.end()) {
-      MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, *system_prop);
+    HsaMemoryProperties system_props;
+    std::memset(&system_props, 0, sizeof(HsaMemoryProperties));
+    system_props.HeapType = HSA_HEAPTYPE_SYSTEM;
+    system_props.SizeInBytes = 0;
+    system_props.VirtualBaseAddress = 0;

-      regions_.push_back(system_region_fine);
+    if (system_prop != mem_props.end()) system_props = *system_prop;

-      if (!is_apu_node) {
-        MemoryRegion* system_region_coarse =
-            new MemoryRegion(false, is_apu_node, this, *system_prop);
+    MemoryRegion* system_region_fine =
+        new MemoryRegion(true, false, is_apu_node, this, system_props);
+    regions_.push_back(system_region_fine);
+    MemoryRegion* system_region_kernarg =
+        new MemoryRegion(true, true, is_apu_node, this, system_props);
+    regions_.push_back(system_region_kernarg);

-        regions_.push_back(system_region_coarse);
-      }
-    } else {
-      HsaMemoryProperties system_props;
-      std::memset(&system_props, 0, sizeof(HsaMemoryProperties));
-
-      system_props.HeapType = HSA_HEAPTYPE_SYSTEM;
-      system_props.SizeInBytes = 0;
-      system_props.VirtualBaseAddress = 0;
-
-      MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, system_props);
-      regions_.push_back(system_region_fine);
-
-      if (!is_apu_node) {
-        MemoryRegion* system_region_coarse =
-            new MemoryRegion(false, is_apu_node, this, system_props);
-        regions_.push_back(system_region_coarse);
-      }
+    if (!is_apu_node) {
+      // system_prop may be mem_props.end() when no system heap was reported, so use the
+      // system_props copy (zero-sized default or *system_prop) instead of dereferencing it.
+      regions_.push_back(new MemoryRegion(false, false, is_apu_node, this, system_props));
     }
  }
 }
@@ -77,7 +77,7 @@ extern HsaApiTable hsa_internal_api_table_;
 } // namespace core

 namespace AMD {
-GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
+GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode)
    : GpuAgentInt(node),
      properties_(node_props),
      current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
@@ -112,16 +112,21 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)

  rocr::core::IsaFeature sramecc = rocr::core::IsaFeature::Unsupported;
  if (isa_base->IsSrameccSupported()) {
-    sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1
-                   ? core::IsaFeature::Enabled
-                   : core::IsaFeature::Disabled;
+    sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled
+                                                              : core::IsaFeature::Disabled;
+    // sramecc control for emulator.
+    if (core::Runtime::runtime_singleton_->flag().sramecc() != Flag::FLAG_DEFAULT) {
+      sramecc = core::Runtime::runtime_singleton_->flag().sramecc() == Flag::FLAG_ENABLE
+          ? core::IsaFeature::Enabled
+          : core::IsaFeature::Disabled;
+    }
  }

  rocr::core::IsaFeature xnack = rocr::core::IsaFeature::Unsupported;
  if (isa_base->IsXnackSupported()) {
    // TODO: This needs to be obtained form KFD once HMM implemented.
-    xnack = profile_ == HSA_PROFILE_FULL ? core::IsaFeature::Enabled
-                                         : core::IsaFeature::Disabled;
+    xnack = xnack_mode ? core::IsaFeature::Enabled
+                      : core::IsaFeature::Disabled;
  }

  // Set instruction set architecture via node property, only on GPU device.
@@ -202,6 +207,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
    ASICShader compute_7;
    ASICShader compute_8;
    ASICShader compute_9;
+    ASICShader compute_90a;
    ASICShader compute_1010;
    ASICShader compute_10;
  };
@@ -212,6 +218,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
           {NULL, 0, 0, 0},
           {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},
           {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4},
+           {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4},
           {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4},
           {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4},
       }},
@@ -220,6 +227,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
           {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12},
           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
+           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
           {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},
           {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},
       }},
@@ -228,6 +236,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
           {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},
           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
+           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
           {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},
           {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},
       }},
@@ -236,6 +245,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
           {kCodeFill7, sizeof(kCodeFill7), 19, 8},
           {kCodeFill8, sizeof(kCodeFill8), 19, 8},
           {kCodeFill8, sizeof(kCodeFill8), 19, 8},
+           {kCodeFill8, sizeof(kCodeFill8), 19, 8},
           {kCodeFill10, sizeof(kCodeFill10), 19, 8},
           {kCodeFill10, sizeof(kCodeFill10), 19, 8},
       }}};
@@ -254,6 +264,9 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
      asic_shader = &compiled_shader_it->second.compute_8;
      break;
    case 9:
+      if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10))
+        asic_shader = &compiled_shader_it->second.compute_90a;
+      else
        asic_shader = &compiled_shader_it->second.compute_9;
      break;
    case 10:
@@ -302,6 +315,14 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
                     AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2);
    AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
                     AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1);
+
+    if ((isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) &&
+        (isa_->GetStepping() == 10)) {
+      // Program COMPUTE_PGM_RSRC3.ACCUM_OFFSET for 0 ACC VGPRs on gfx90a.
+      // FIXME: Assemble code objects from source at build time
+      int gran_accvgprs = ((gran_vgprs + 1) * 8) / 4 - 1;
+      header->max_scratch_backing_memory_byte_size = uint64_t(gran_accvgprs) << 32;
+    }
  }

  // Copy shader code into the GPU-visible buffer.
@@ -338,8 +359,7 @@ void GpuAgent::InitRegionList() {
          memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax;
        case HSA_HEAPTYPE_GPU_LDS:
        case HSA_HEAPTYPE_GPU_SCRATCH: {
-          MemoryRegion* region =
-              new MemoryRegion(false, false, this, mem_props[mem_idx]);
+          MemoryRegion* region = new MemoryRegion(false, false, false, this, mem_props[mem_idx]);

          regions_.push_back(region);

@@ -348,7 +368,7 @@ void GpuAgent::InitRegionList() {
            // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
            if ((properties_.HiveID != 0) ||
                (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) {
-              regions_.push_back(new MemoryRegion(true, false, this, mem_props[mem_idx]));
+              regions_.push_back(new MemoryRegion(true, false, false, this, mem_props[mem_idx]));
            }
          }
          break;
@@ -100,9 +100,9 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
  hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr));
 }

-MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
+MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner,
                           const HsaMemoryProperties& mem_props)
-    : core::MemoryRegion(fine_grain, full_profile, owner),
+    : core::MemoryRegion(fine_grain, kernarg, full_profile, owner),
      mem_props_(mem_props),
      max_single_alloc_size_(0),
      virtual_size_(0),
@@ -128,6 +128,8 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne
    mem_flag_.ui32.HostAccess = 1;
    mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED;

+    if (kernarg) mem_flag_.ui32.Uncached = 1;
+
    virtual_size_ =
        (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize;
  }
@@ -290,16 +292,14 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute,
    case HSA_REGION_INFO_GLOBAL_FLAGS:
      switch (mem_props_.HeapType) {
        case HSA_HEAPTYPE_SYSTEM:
-          *((uint32_t*)value) = fine_grain()
-                                    ? (HSA_REGION_GLOBAL_FLAG_KERNARG |
-                                       HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
-                                    : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED;
-          break;
-        case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
        case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
-          *((uint32_t*)value) = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
-                                             : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED;
+        case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: {
+          uint32_t ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
+                                      : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED;
+          if (kernarg()) ret |= HSA_REGION_GLOBAL_FLAG_KERNARG;
+          *((uint32_t*)value) = ret;
          break;
+        }
        default:
          *((uint32_t*)value) = 0;
          break;
@@ -68,6 +68,38 @@ namespace AMD {
 static const uint kKfdVersionMajor = 0;
 static const uint kKfdVersionMinor = 99;

+// Determines the Xnack mode of the ROCm system, honoring the user's preference
+// when one is set. Returns true if Xnack mode is ON or false if OFF. Xnack mode
+// of a system is orthogonal to devices that do not support Xnack mode: it is legal
+// for a system with Xnack ON to have devices that do not support Xnack functionality.
+bool BindXnackMode() {
+  // Get users' preference for Xnack mode of ROCm platform. The flag is read once
+  // so the value compared below is the same value that is handed to the driver.
+  HSAint32 mode = core::Runtime::runtime_singleton_->flag().xnack();
+  const bool config_xnack = (mode != Flag::XNACK_REQUEST::XNACK_UNCHANGED);
+
+  // Indicate to driver users' preference for Xnack mode.
+  // Call to driver can fail and is a supported feature; on failure fall through
+  // and report whatever mode the driver says is bound.
+  if (config_xnack) {
+    const HSAKMT_STATUS set_status = hsaKmtSetXNACKMode(mode);
+    if (set_status == HSAKMT_STATUS_SUCCESS) {
+      return mode != 0;
+    }
+  }
+
+  // Get Xnack mode of devices bound by driver. This could happen
+  // when a call to SET Xnack mode fails or user has no particular
+  // preference. mode is already an HSAint32, so no pointer cast is needed.
+  const HSAKMT_STATUS get_status = hsaKmtGetXNACKMode(&mode);
+  if (get_status != HSAKMT_STATUS_SUCCESS) {
+    // The query itself failed, so fall back to the conservative answer.
+    debug_print("KFD does not support xnack mode query.\nROCr must assume xnack is disabled.\n");
+    return false;
+  }
+  return mode != 0;
+}
+
 CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
  if (node_prop.NumCPUCores == 0) {
    return nullptr;
@@ -79,14 +111,14 @@ CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
  return cpu;
 }

-GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
+GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnack_mode) {
  GpuAgent* gpu = nullptr;
  if (node_prop.NumFComputeCores == 0) {
      // Ignore non GPUs.
      return nullptr;
  }
  try {
-    gpu = new GpuAgent(node_id, node_prop);
+    gpu = new GpuAgent(node_id, node_prop, xnack_mode);

    const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion();

@@ -174,20 +206,26 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
        link_info.atomic_support_32bit = true;
        link_info.atomic_support_64bit = true;
        link_info.coherent_support = true;
+        if (core::Runtime::runtime_singleton_->flag().patch_xgmi_link_weight()) {
+          io_link.Weight = 15;
+        }
        break;
      default:
        debug_print("Unrecognized IOLINK type.\n");
        break;
    }

-    if (io_link.Flags.ui32.Override == 1) {
-      if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) {
-        // Ignore this link since peer to peer is not allowed.
-        continue;
+    // KFD is reporting wrong override status for XGMI.  Disallow override for bringup.
+    if (!core::Runtime::runtime_singleton_->flag().patch_link_override()) {
+      if (io_link.Flags.ui32.Override == 1) {
+        if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) {
+          // Ignore this link since peer to peer is not allowed.
+          continue;
+        }
+        link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0);
+        link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0);
+        link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0);
      }
-      link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0);
-      link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0);
-      link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0);
    }

    link_info.max_bandwidth = io_link.MaximumBandwidth;
@@ -204,7 +242,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
 /**
 * Process the list of Gpus that are surfaced to user
 */
-static void SurfaceGpuList(std::vector<int32_t>& gpu_list) {
+static void SurfaceGpuList(std::vector<int32_t>& gpu_list, bool xnack_mode) {
  // Process user visible Gpu devices
  int32_t invalidIdx = -1;
  int32_t list_sz = gpu_list.size();
@@ -221,7 +259,7 @@ static void SurfaceGpuList(std::vector<int32_t>& gpu_list) {
    // Instantiate a Gpu device. The IO links
    // of this node have already been registered
    assert((node_prop.NumFComputeCores != 0) && "Improper node used for GPU device discovery.");
-    DiscoverGpu(gpu_list[idx], node_prop);
+    DiscoverGpu(gpu_list[idx], node_prop, xnack_mode);
  }
 }

@@ -305,8 +343,11 @@ void BuildTopology() {
    RegisterLinkInfo(node_id, node_prop.NumIOLinks);
  }

+  // Determine the Xnack mode to be bound for system
+  bool xnack_mode = BindXnackMode();
+
  // Instantiate ROCr objects to encapsulate Gpu devices
-  SurfaceGpuList(gpu_usr_list);
+  SurfaceGpuList(gpu_usr_list, xnack_mode);
 }

 bool Load() {
@@ -391,6 +391,9 @@ void HsaApiTable::UpdateAmdExts() {
  amd_ext_api.hsa_amd_register_deallocation_callback_fn = AMD::hsa_amd_register_deallocation_callback;
  amd_ext_api.hsa_amd_deregister_deallocation_callback_fn = AMD::hsa_amd_deregister_deallocation_callback;
  amd_ext_api.hsa_amd_signal_value_pointer_fn = AMD::hsa_amd_signal_value_pointer;
+  amd_ext_api.hsa_amd_svm_attributes_set_fn = AMD::hsa_amd_svm_attributes_set;
+  amd_ext_api.hsa_amd_svm_attributes_get_fn = AMD::hsa_amd_svm_attributes_get;
+  amd_ext_api.hsa_amd_svm_prefetch_async_fn = AMD::hsa_amd_svm_prefetch_async;
 }

 void LoadInitialHsaApiTable() {
@@ -1009,5 +1009,37 @@ hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifie
  CATCH;
 }

+// API entry point for setting SVM attributes on the range described by ptr
+// and size. Forwards all arguments unchanged to Runtime::SetSvmAttrib on the
+// runtime singleton and returns that call's status.
+// NOTE(review): TRY/CATCH and IS_OPEN() are macros defined outside this hunk;
+// presumably they map thrown exceptions to hsa_status_t codes and reject calls
+// made while the runtime is not open - confirm against their definitions.
+hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count) {
+  TRY;
+  IS_OPEN();
+  return core::Runtime::runtime_singleton_->SetSvmAttrib(ptr, size, attribute_list,
+                                                         attribute_count);
+  CATCH;
+}
+
+// API entry point for querying SVM attributes of the range described by ptr
+// and size. Forwards all arguments unchanged to Runtime::GetSvmAttrib on the
+// runtime singleton, which writes the results back into attribute_list, and
+// returns that call's status.
+// NOTE(review): TRY/CATCH and IS_OPEN() are macros defined outside this hunk;
+// presumably they map thrown exceptions to hsa_status_t codes and reject calls
+// made while the runtime is not open - confirm against their definitions.
+hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count) {
+  TRY;
+  IS_OPEN();
+  return core::Runtime::runtime_singleton_->GetSvmAttrib(ptr, size, attribute_list,
+                                                         attribute_count);
+  CATCH;
+}
+
+// API entry point for an asynchronous SVM prefetch of the range described by
+// ptr and size to the given agent. Forwards all arguments unchanged to
+// Runtime::SvmPrefetch on the runtime singleton and returns that call's status.
+// NOTE(review): TRY/CATCH and IS_OPEN() are macros defined outside this hunk;
+// presumably they map thrown exceptions to hsa_status_t codes and reject calls
+// made while the runtime is not open - confirm against their definitions.
+hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                        hsa_signal_t completion_signal) {
+  TRY;
+  IS_OPEN();
+  // Validate inputs.
+  // NOTE(review): no validation is performed here yet; the signal-type check
+  // below is disabled.
+  // if (core::g_use_interrupt_wait && (!core::InterruptSignal::IsType(signal)))
+  return core::Runtime::runtime_singleton_->SvmPrefetch(ptr, size, agent, num_dep_signals,
+                                                        dep_signals, completion_signal);
+  CATCH;
+}
+
 }   //  namespace amd
 }   //  namespace rocr
@@ -281,6 +281,15 @@ constexpr size_t hsa_name_size = 63;
  ISAREG_ENTRY_GEN("gfx908:sramecc-:xnack+", 9, 0, 8,  disabled,    enabled)
  ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack-", 9, 0, 8,  enabled,     disabled)
  ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack+", 9, 0, 8,  enabled,     enabled)
+  ISAREG_ENTRY_GEN("gfx90a",                 9, 0, 10, any,         any)
+  ISAREG_ENTRY_GEN("gfx90a:xnack-",          9, 0, 10, any,         disabled)
+  ISAREG_ENTRY_GEN("gfx90a:xnack+",          9, 0, 10, any,         enabled)
+  ISAREG_ENTRY_GEN("gfx90a:sramecc-",        9, 0, 10, disabled,    any)
+  ISAREG_ENTRY_GEN("gfx90a:sramecc+",        9, 0, 10, enabled,     any)
+  ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack-", 9, 0, 10, disabled,    disabled)
+  ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack+", 9, 0, 10, disabled,    enabled)
+  ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack-", 9, 0, 10, enabled,     disabled)
+  ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack+", 9, 0, 10, enabled,     enabled)
  ISAREG_ENTRY_GEN("gfx1010",                10, 1, 0, unsupported, any)
  ISAREG_ENTRY_GEN("gfx1010:xnack-",         10, 1, 0, unsupported, disabled)
  ISAREG_ENTRY_GEN("gfx1010:xnack+",         10, 1, 0, unsupported, enabled)
@@ -60,6 +60,7 @@
 #include "core/inc/hsa_ext_amd_impl.h"
 #include "core/inc/hsa_api_trace_int.h"
 #include "core/util/os.h"
+#include "core/inc/exceptions.h"
 #include "inc/hsa_ven_amd_aqlprofile.h"

 #define HSA_VERSION_MAJOR 1
@@ -173,16 +174,29 @@ void Runtime::RegisterAgent(Agent* agent) {
    if (cpu_agents_.size() == 1) {
      // Might need memory pooling to cover allocation that
      // requires less than 4096 bytes.
-      system_allocator_ = [this](size_t size, size_t align, MemoryRegion::AllocateFlags alloc_flags) -> void* {
-        assert(align <= 4096);
-        void* ptr = nullptr;
-        core::Runtime::runtime_singleton_->AllocateMemory(system_regions_fine_[0], size, alloc_flags, &ptr);
-        return ptr;
-      };

-      system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
+      // Default system pool must support kernarg
+      for (auto pool : system_regions_fine_) {
+        if (pool->kernarg()) {
+          system_allocator_ = [pool](size_t size, size_t alignment,
+                                     MemoryRegion::AllocateFlags alloc_flags) -> void* {
+            assert(alignment <= 4096);
+            void* ptr = NULL;
+            return (HSA_STATUS_SUCCESS ==
+                    core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
+                                                                      &ptr))
+                ? ptr
+                : NULL;
+          };

-      BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
+          system_deallocator_ = [](void* ptr) {
+            core::Runtime::runtime_singleton_->FreeMemory(ptr);
+          };
+
+          BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
+          break;
+        }
+      }
    }
  } else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) {
    gpu_agents_.push_back(agent);
@@ -630,6 +644,18 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) {
      *(const char**)value = STRING(ROCR_BUILD_ID);
      break;
    }
+    case HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: {
+      // todo: Get HMM kernel support info.
+      *(bool*)value = true;
+      break;
+    }
+    case HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: {
+      bool ret = true;
+      for(auto agent : gpu_agents_)
+        ret &= (agent->isa()->GetXnack() == IsaFeature::Enabled);
+      *(bool*)value = ret;
+      break;
+    }
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
@@ -1572,5 +1598,499 @@ void Runtime::InternalQueueCreateNotify(const hsa_queue_t* queue, hsa_agent_t ag
    internal_queue_create_notifier_(queue, agent, internal_queue_create_notifier_user_data_);
 }

+// Applies a list of SVM attributes to the page-aligned range that covers
+// [ptr, ptr + size).
+//
+// Every entry of attribute_list is validated and translated into the thunk's
+// HSA_SVM_ATTRIBUTE form. Flag-style attributes are accumulated into one
+// set-flags and one clear-flags entry. The translated list is then passed to
+// hsaKmtSVMSetAttr in a single call.
+//
+// Params:
+//   ptr, size        - range to update; expanded outward to 4096 byte bounds.
+//   attribute_list   - attribute/value pairs to apply.
+//   attribute_count  - number of entries in attribute_list.
+//
+// Returns HSA_STATUS_SUCCESS. Failures are reported by throwing
+// AMD::hsa_exception with one of:
+//   HSA_STATUS_ERROR_INVALID_AGENT          - a value is not a valid agent.
+//   HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS - attribute or agent given twice.
+//   HSA_STATUS_ERROR_INVALID_ARGUMENT       - unknown attribute or flag value.
+//   HSA_STATUS_ERROR                        - hsaKmtSVMSetAttr failed.
+hsa_status_t Runtime::SetSvmAttrib(void* ptr, size_t size,
+                                   hsa_amd_svm_attribute_pair_t* attribute_list,
+                                   size_t attribute_count) {
+  // Bitmask of the single-instance attributes already seen in the list.
+  uint32_t set_attribs = 0;
+  // Marks, per node id, that an access attribute was already given.
+  // NOTE(review): sized by agents_by_node_.size(); assumes node ids are dense
+  // and smaller than that size - confirm.
+  std::vector<bool> agent_seen(agents_by_node_.size(), false);
+
+  // Translated attributes handed to the thunk.
+  std::vector<HSA_SVM_ATTRIBUTE> attribs;
+  attribs.reserve(attribute_count);
+  uint32_t set_flags = 0;
+  uint32_t clear_flags = 0;
+
+  // Converts an attribute value to an agent; null or invalid handles throw.
+  auto Convert = [&](uint64_t value) -> Agent* {
+    hsa_agent_t handle = {value};
+    Agent* agent = Agent::Convert(handle);
+    if ((agent == nullptr) || !agent->IsValid())
+      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
+                               "Invalid agent handle in Runtime::SetSvmAttrib.");
+    return agent;
+  };
+
+  // Same as Convert, except that a null agent is passed through to the caller.
+  auto ConvertAllowNull = [&](uint64_t value) -> Agent* {
+    hsa_agent_t handle = {value};
+    Agent* agent = Agent::Convert(handle);
+    if ((agent != nullptr) && (!agent->IsValid()))
+      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
+                               "Invalid agent handle in Runtime::SetSvmAttrib.");
+    return agent;
+  };
+
+  // Rejects a second access attribute for an agent on the same node.
+  auto ConfirmNew = [&](Agent* agent) {
+    if (agent_seen[agent->node_id()])
+      throw AMD::hsa_exception(
+          HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS,
+          "Multiple attributes given for the same agent in Runtime::SetSvmAttrib.");
+    agent_seen[agent->node_id()] = true;
+  };
+
+  // Rejects a single-instance attribute that appears more than once.
+  auto Check = [&](uint64_t attrib) {
+    // Shift an unsigned one so the mask never goes through a signed int.
+    const uint32_t bit = static_cast<uint32_t>(1) << attrib;
+    if (set_attribs & bit)
+      throw AMD::hsa_exception(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS,
+                               "Attribute given multiple times in Runtime::SetSvmAttrib.");
+    set_attribs |= bit;
+  };
+
+  // Builds one thunk attribute entry.
+  auto kmtPair = [](uint32_t attrib, uint32_t value) {
+    HSA_SVM_ATTRIBUTE pair = {attrib, value};
+    return pair;
+  };
+
+  for (size_t i = 0; i < attribute_count; i++) {
+    auto attrib = attribute_list[i].attribute;
+    auto value = attribute_list[i].value;
+
+    switch (attrib) {
+      case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: {
+        Check(attrib);
+        switch (value) {
+          case HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED:
+            set_flags |= HSA_SVM_FLAG_COHERENT;
+            break;
+          case HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED:
+            clear_flags |= HSA_SVM_FLAG_COHERENT;
+            break;
+          default:
+            throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
+                                     "Invalid HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG value.");
+        }
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_READ_ONLY: {
+        Check(attrib);
+        if (value)
+          set_flags |= HSA_SVM_FLAG_GPU_RO;
+        else
+          clear_flags |= HSA_SVM_FLAG_GPU_RO;
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: {
+        Check(attrib);
+        if (value)
+          set_flags |= HSA_SVM_FLAG_HIVE_LOCAL;
+        else
+          clear_flags |= HSA_SVM_FLAG_HIVE_LOCAL;
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: {
+        Check(attrib);
+        // Max migration size is 1GB.
+        if (value > 18) value = 18;
+        attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, value));
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: {
+        Check(attrib);
+        // A null agent clears the preferred location.
+        Agent* agent = ConvertAllowNull(value);
+        if (agent == nullptr)
+          attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, INVALID_NODEID));
+        else
+          attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, agent->node_id()));
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE: {
+        Agent* agent = Convert(value);
+        ConfirmNew(agent);
+        // CPU access is expressed as a flag, GPU access as a per-node entry.
+        if (agent->device_type() == Agent::kAmdCpuDevice) {
+          set_flags |= HSA_SVM_FLAG_HOST_ACCESS;
+        } else {
+          attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id()));
+        }
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE: {
+        Agent* agent = Convert(value);
+        ConfirmNew(agent);
+        if (agent->device_type() == Agent::kAmdCpuDevice) {
+          set_flags |= HSA_SVM_FLAG_HOST_ACCESS;
+        } else {
+          attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS_IN_PLACE, agent->node_id()));
+        }
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS: {
+        Agent* agent = Convert(value);
+        ConfirmNew(agent);
+        if (agent->device_type() == Agent::kAmdCpuDevice) {
+          clear_flags |= HSA_SVM_FLAG_HOST_ACCESS;
+        } else {
+          attribs.push_back(kmtPair(HSA_SVM_ATTR_NO_ACCESS, agent->node_id()));
+        }
+        break;
+      }
+      default:
+        throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
+                                 "Illegal or invalid attribute in Runtime::SetSvmAttrib");
+    }
+  }
+
+  // Merge CPU access properties - grant access if any CPU needs access.
+  // NOTE: the author of this rule marked it as "probably wrong".
+  if (set_flags & HSA_SVM_FLAG_HOST_ACCESS) clear_flags &= ~HSA_SVM_FLAG_HOST_ACCESS;
+
+  // Add flag updates
+  if (clear_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_CLR_FLAGS, clear_flags));
+  if (set_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, set_flags));
+
+  // Expand the range outward to 4096 byte boundaries.
+  uint8_t* base = AlignDown((uint8_t*)ptr, 4096);
+  uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096);
+  size_t len = end - base;
+  // data() is well defined for an empty vector, unlike &attribs[0].
+  HSAKMT_STATUS error = hsaKmtSVMSetAttr(base, len, attribs.size(), attribs.data());
+  if (error != HSAKMT_STATUS_SUCCESS)
+    throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMSetAttr failed.");
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Queries SVM attributes of the page-aligned range that covers
+// [ptr, ptr + size) and writes the answers back into attribute_list.
+//
+// Works in two passes. The first pass translates each requested attribute into
+// a thunk query (or notes that it is answered from the flags word or from the
+// runtime's own prefetch tracking). After one hsaKmtSVMGetAttr call, the
+// second pass converts the thunk results back into HSA values.
+//
+// Params:
+//   ptr, size        - range to query; expanded outward to 4096 byte bounds.
+//   attribute_list   - in: attributes to query (for ACCESS_QUERY the value is
+//                      the agent handle); out: results. For ACCESS_QUERY the
+//                      result replaces the attribute field.
+//   attribute_count  - number of entries in attribute_list.
+//
+// Returns HSA_STATUS_SUCCESS. Failures are reported by throwing
+// AMD::hsa_exception (INVALID_AGENT, INVALID_ARGUMENT, or HSA_STATUS_ERROR when
+// hsaKmtSVMGetAttr fails).
+hsa_status_t Runtime::GetSvmAttrib(void* ptr, size_t size,
+                                   hsa_amd_svm_attribute_pair_t* attribute_list,
+                                   size_t attribute_count) {
+  // Queries handed to the thunk.
+  std::vector<HSA_SVM_ATTRIBUTE> attribs;
+  attribs.reserve(attribute_count);
+
+  // For each request: index of its thunk query in attribs, or -1 when the
+  // request is not answered by a dedicated thunk entry.
+  std::vector<int> kmtIndices(attribute_count);
+
+  // Set when any request needs the flags word, which is queried last.
+  bool getFlags = false;
+
+  // Converts an attribute value to an agent; null or invalid handles throw.
+  auto Convert = [&](uint64_t value) -> Agent* {
+    hsa_agent_t handle = {value};
+    Agent* agent = Agent::Convert(handle);
+    if ((agent == nullptr) || !agent->IsValid())
+      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
+                               "Invalid agent handle in Runtime::GetSvmAttrib.");
+    return agent;
+  };
+
+  // Builds one thunk attribute entry.
+  auto kmtPair = [](uint32_t attrib, uint32_t value) {
+    HSA_SVM_ATTRIBUTE pair = {attrib, value};
+    return pair;
+  };
+
+  // Pass 1: build the thunk query list.
+  for (size_t i = 0; i < attribute_count; i++) {
+    auto& attrib = attribute_list[i].attribute;
+    auto& value = attribute_list[i].value;
+
+    switch (attrib) {
+      case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG:
+      case HSA_AMD_SVM_ATTRIB_READ_ONLY:
+      case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: {
+        getFlags = true;
+        kmtIndices[i] = -1;
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: {
+        kmtIndices[i] = attribs.size();
+        attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, 0));
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: {
+        kmtIndices[i] = attribs.size();
+        attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, 0));
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: {
+        // Answered immediately from the runtime's prefetch tracking.
+        value = Agent::Convert(GetSVMPrefetchAgent(ptr, size)).handle;
+        kmtIndices[i] = -1;
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: {
+        Agent* agent = Convert(value);
+        // CPU access lives in the flags word, GPU access in a per-node entry.
+        if (agent->device_type() == Agent::kAmdCpuDevice) {
+          getFlags = true;
+          kmtIndices[i] = -1;
+        } else {
+          kmtIndices[i] = attribs.size();
+          attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id()));
+        }
+        break;
+      }
+      default:
+        throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
+                                 "Illegal or invalid attribute in Runtime::GetSvmAttrib");
+    }
+  }
+
+  // The flags query is always the last entry so pass 2 can find it.
+  if (getFlags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, 0));
+
+  // Expand the range outward to 4096 byte boundaries.
+  uint8_t* base = AlignDown((uint8_t*)ptr, 4096);
+  uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096);
+  size_t len = end - base;
+  if (attribs.size() != 0) {
+    HSAKMT_STATUS error = hsaKmtSVMGetAttr(base, len, attribs.size(), attribs.data());
+    if (error != HSAKMT_STATUS_SUCCESS)
+      throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMGetAttr failed.");
+  }
+
+  // Pass 2: translate thunk results back into HSA values.
+  for (size_t i = 0; i < attribute_count; i++) {
+    auto& attrib = attribute_list[i].attribute;
+    auto& value = attribute_list[i].value;
+
+    switch (attrib) {
+      case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: {
+        if (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_COHERENT)
+          value = HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED;
+        else
+          value = HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED;
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_READ_ONLY: {
+        // Non-zero means read only; the raw masked flag bit is returned.
+        value = (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_GPU_RO);
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: {
+        // Non-zero means hive local; the raw masked flag bit is returned.
+        value = (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_HIVE_LOCAL);
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: {
+        value = attribs[kmtIndices[i]].value;
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: {
+        uint64_t node = attribs[kmtIndices[i]].value;
+        Agent* agent = nullptr;
+        // NOTE(review): assumes the driver only reports nodes that have at
+        // least one agent registered in agents_by_node_ - confirm.
+        if (node != INVALID_NODEID) agent = agents_by_node_[node][0];
+        value = Agent::Convert(agent).handle;
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: {
+        // Already answered in pass 1.
+        break;
+      }
+      case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: {
+        if (kmtIndices[i] == -1) {
+          // CPU agent: accessibility is the host-access flag. Always replace
+          // the pseudo-attribute so the caller never reads ACCESS_QUERY back.
+          if (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_HOST_ACCESS)
+            attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE;
+          else
+            attrib = HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS;
+        } else {
+          // GPU agent: the thunk rewrites the entry's type with the answer.
+          switch (attribs[kmtIndices[i]].type) {
+            case HSA_SVM_ATTR_ACCESS:
+              attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE;
+              break;
+            case HSA_SVM_ATTR_ACCESS_IN_PLACE:
+              attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE;
+              break;
+            case HSA_SVM_ATTR_NO_ACCESS:
+              attrib = HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS;
+              break;
+            default:
+              assert(false && "Bad agent accessibility from KFD.");
+          }
+        }
+        break;
+      }
+      default:
+        throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
+                                 "Illegal or invalid attribute in Runtime::GetSvmAttrib");
+    }
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Schedules an asynchronous prefetch of the page-aligned range that covers
+// [ptr, ptr + size) to the node of the given agent.
+//
+// The request is recorded as a PrefetchOp and entered into prefetch_map_,
+// trimming or removing any older tracked ranges it overlaps. The prefetch
+// itself (hsaKmtSVMSetAttr with HSA_SVM_ATTR_PREFETCH_LOC) is issued from a
+// signal handler once every dependency signal has reached zero, or from an
+// async function when there are no dependencies. When it has been issued, the
+// op's ranges are removed from the map and completion_signal, if non-null, is
+// decremented by one.
+//
+// Params:
+//   ptr, size          - range to prefetch; expanded to 4096 byte bounds.
+//   agent              - destination agent; a CPU agent maps to node 0.
+//   num_dep_signals    - number of entries in dep_signals.
+//   dep_signals        - signals that must reach 0 before the prefetch runs.
+//   completion_signal  - decremented when the prefetch was issued; may be null.
+//
+// Returns HSA_STATUS_SUCCESS. Throws AMD::hsa_exception for an invalid agent
+// handle or when the handler / async function cannot be installed; on the
+// latter the op and its map entries are rolled back by the scope guards.
+hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent,
+                                  uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                  hsa_signal_t completion_signal) {
+  // Reject bad handles before any state is created or modified.
+  Agent* dest = Agent::Convert(agent);
+  if ((dest == nullptr) || !dest->IsValid())
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT,
+                             "Invalid agent handle in Runtime::SvmPrefetch.");
+
+  uintptr_t base = reinterpret_cast<uintptr_t>(AlignDown(ptr, 4096));
+  uintptr_t end = AlignUp(reinterpret_cast<uintptr_t>(ptr) + size, 4096);
+  size_t len = end - base;
+
+  PrefetchOp* op = new PrefetchOp();
+  MAKE_NAMED_SCOPE_GUARD(OpGuard, [&]() { delete op; });
+
+  if (dest->device_type() == Agent::kAmdCpuDevice)
+    op->node_id = 0;
+  else
+    op->node_id = dest->node_id();
+
+  op->base = reinterpret_cast<void*>(base);
+  op->size = len;
+  op->completion = completion_signal;
+  // The last dependency is waited on first (below); the rest are stored on
+  // the op and consumed one at a time by the signal handler.
+  if (num_dep_signals > 1) {
+    op->remaining_deps = num_dep_signals - 1;
+    for (uint32_t i = 0; i < num_dep_signals - 1; i++) op->dep_signals.push_back(dep_signals[i]);
+  } else {
+    op->remaining_deps = 0;
+  }
+
+  {
+    ScopedAcquire<KernelMutex> lock(&prefetch_lock_);
+    // Remove all fully overlapped and trim partially overlapped ranges.
+    // Get iteration bounds
+    auto start = prefetch_map_.upper_bound(base);
+    if (start != prefetch_map_.begin()) start--;
+    auto stop = prefetch_map_.lower_bound(end);
+
+    // Ranges of one op form a doubly linked list; end() marks both ends.
+    auto isEndNode = [&](decltype(start) node) { return node->second.next == prefetch_map_.end(); };
+    auto isFirstNode = [&](decltype(start) node) {
+      return node->second.prev == prefetch_map_.end();
+    };
+
+    // Trim and remove old ranges.
+    while (start != stop) {
+      uintptr_t startBase = start->first;
+      uintptr_t startEnd = startBase + start->second.bytes;
+
+      auto ibase = Max(startBase, base);
+      auto iend = Min(startEnd, end);
+      // Check for overlap
+      if (ibase < iend) {
+        // Second range check: the old range extends past the new one, so its
+        // tail survives as a new node linked in right after the old node.
+        if (iend < startEnd) {
+          auto ret = prefetch_map_.insert(
+              std::make_pair(iend, PrefetchRange(startEnd - iend, start->second.op)));
+          assert(ret.second && "Prefetch map insert failed during range split.");
+
+          auto it = ret.first;
+          it->second.prev = start;
+          it->second.next = start->second.next;
+          start->second.next = it;
+          if (!isEndNode(it)) it->second.next->second.prev = it;
+        }
+
+        // Is the first interval of the old range valid
+        if (startBase < ibase) {
+          // The head of the old range survives; shorten it.
+          start->second.bytes = ibase - startBase;
+        } else {
+          // Nothing before the overlap survives: unlink and drop the node.
+          if (isFirstNode(start)) {
+            start->second.op->prefetch_map_entry = start->second.next;
+            if (!isEndNode(start)) start->second.next->second.prev = prefetch_map_.end();
+          } else {
+            start->second.prev->second.next = start->second.next;
+            if (!isEndNode(start)) start->second.next->second.prev = start->second.prev;
+          }
+          // erase() invalidates 'start'; continue from the iterator it
+          // returns instead of incrementing the erased one.
+          start = prefetch_map_.erase(start);
+          continue;
+        }
+      }
+      start++;
+    }
+
+    // Insert new range.
+    auto ret = prefetch_map_.insert(std::make_pair(base, PrefetchRange(len, op)));
+    assert(ret.second && "Prefetch map insert failed.");
+
+    auto it = ret.first;
+    op->prefetch_map_entry = it;
+    it->second.next = it->second.prev = prefetch_map_.end();
+  }
+
+  // Remove the prefetch's ranges from the map.
+  static auto removePrefetchRanges = [](PrefetchOp* op) {
+    ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
+    auto it = op->prefetch_map_entry;
+    while (it != Runtime::runtime_singleton_->prefetch_map_.end()) {
+      auto next = it->second.next;
+      Runtime::runtime_singleton_->prefetch_map_.erase(it);
+      it = next;
+    }
+  };
+
+  // Prefetch Signal handler for synchronization.
+  // Each call either re-arms itself on the next stored dependency or, once
+  // none remain, issues the prefetch and releases the op. It always returns
+  // false.
+  static hsa_amd_signal_handler signal_handler = [](hsa_signal_value_t value, void* arg) {
+    PrefetchOp* op = reinterpret_cast<PrefetchOp*>(arg);
+
+    if (op->remaining_deps > 0) {
+      op->remaining_deps--;
+      Runtime::runtime_singleton_->SetAsyncSignalHandler(
+          op->dep_signals[op->remaining_deps], HSA_SIGNAL_CONDITION_EQ, 0, signal_handler, arg);
+      return false;
+    }
+
+    HSA_SVM_ATTRIBUTE attrib;
+    attrib.type = HSA_SVM_ATTR_PREFETCH_LOC;
+    attrib.value = op->node_id;
+    HSAKMT_STATUS error = hsaKmtSVMSetAttr(op->base, op->size, 1, &attrib);
+    assert(error == HSAKMT_STATUS_SUCCESS && "KFD Prefetch failed.");
+
+    removePrefetchRanges(op);
+
+    if (op->completion.handle != 0) Signal::Convert(op->completion)->SubRelaxed(1);
+    delete op;
+
+    return false;
+  };
+
+  // Adapter used when there is nothing to wait for.
+  auto no_dependencies = [](void* arg) { signal_handler(0, arg); };
+
+  // Undo the map insertion if the handler cannot be installed.
+  MAKE_NAMED_SCOPE_GUARD(RangeGuard, [&]() { removePrefetchRanges(op); });
+
+  hsa_status_t err;
+  if (num_dep_signals == 0)
+    err = AMD::hsa_amd_async_function(no_dependencies, op);
+  else
+    err = SetAsyncSignalHandler(dep_signals[num_dep_signals - 1], HSA_SIGNAL_CONDITION_EQ, 0,
+                                signal_handler, op);
+  if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "Signal handler unable to be set.");
+
+  // Ownership of op has passed to the handler.
+  RangeGuard.Dismiss();
+  OpGuard.Dismiss();
+  return HSA_STATUS_SUCCESS;
+}
+
+// Reports the agent that the page-aligned range covering [ptr, ptr + size) is
+// being, or was last, prefetched to.
+//
+// Pending prefetches tracked in prefetch_map_ are consulted first. Parts of
+// the range that no pending prefetch covers ("holes") are then queried from
+// the driver with HSA_SVM_ATTR_PREFETCH_LOC.
+//
+// Returns the first agent registered for the common destination node, or
+// nullptr when the range has no single destination (different pending
+// destinations, a driver answer of -1, disagreeing holes, or nothing to query).
+Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
+  // KFD returns -1 for no or mixed destinations.
+  const uint32_t kNoneOrMixed = static_cast<uint32_t>(-1);
+  // Local marker: no destination has been observed yet.
+  const uint32_t kUnset = static_cast<uint32_t>(-2);
+
+  uintptr_t base = reinterpret_cast<uintptr_t>(AlignDown(ptr, 4096));
+  uintptr_t end = AlignUp(reinterpret_cast<uintptr_t>(ptr) + size, 4096);
+
+  // (start address, byte length) of sub-ranges without a pending prefetch.
+  std::vector<std::pair<uintptr_t, uintptr_t>> holes;
+
+  ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
+  auto start = prefetch_map_.upper_bound(base);
+  if (start != prefetch_map_.begin()) start--;
+  auto stop = prefetch_map_.lower_bound(end);
+
+  uint32_t prefetch_node = kUnset;
+
+  while (start != stop) {
+    uintptr_t startBase = start->first;
+    uintptr_t startEnd = startBase + start->second.bytes;
+
+    auto ibase = Max(base, startBase);
+    auto iend = Min(end, startEnd);
+    // Check for intersection with the query
+    if (ibase < iend) {
+      // Take the destination only from ranges that really overlap the query;
+      // the first candidate may lie entirely before it.
+      const uint32_t node = start->second.op->node_id;
+      if (prefetch_node == kUnset) prefetch_node = node;
+
+      // If prefetch locations are different then we report null agent.
+      if (prefetch_node != node) return nullptr;
+
+      // Push leading gap to an array for checking KFD.
+      if (base < ibase) holes.push_back(std::make_pair(base, ibase - base));
+
+      // Trim query range.
+      base = iend;
+    }
+    start++;
+  }
+  // Trailing part of the query that no pending prefetch covers.
+  if (base < end) holes.push_back(std::make_pair(base, end - base));
+
+  HSA_SVM_ATTRIBUTE attrib;
+  attrib.type = HSA_SVM_ATTR_PREFETCH_LOC;
+  for (auto& range : holes) {
+    HSAKMT_STATUS error =
+        hsaKmtSVMGetAttr(reinterpret_cast<void*>(range.first), range.second, 1, &attrib);
+    assert(error == HSAKMT_STATUS_SUCCESS && "KFD prefetch query failed.");
+
+    if (attrib.value == kNoneOrMixed) return nullptr;
+    if (prefetch_node == kUnset) prefetch_node = attrib.value;
+    if (prefetch_node != attrib.value) return nullptr;
+  }
+
+  assert(prefetch_node != kUnset && "prefetch_node was not updated.");
+  assert(prefetch_node != kNoneOrMixed && "Should have already returned.");
+  // An empty query range leaves the node unset; report "no agent" instead of
+  // indexing with the sentinel when asserts are compiled out.
+  if (prefetch_node == kUnset) return nullptr;
+  return agents_by_node_[prefetch_node][0];
+}
+
 }  // namespace core
 }  // namespace rocr
@@ -56,6 +56,13 @@ class Flag {
 public:
  enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT };

+  // The values are meaningful and chosen to satisfy the thunk API.
+  enum XNACK_REQUEST { XNACK_DISABLE = 0, XNACK_ENABLE = 1, XNACK_UNCHANGED = 2 };
+  static_assert(XNACK_DISABLE == 0, "XNACK_REQUEST enum values improperly changed.");
+  static_assert(XNACK_ENABLE == 1, "XNACK_REQUEST enum values improperly changed.");
+
+  enum FLAG_TRI_STATE { FLAG_DISABLE = 0, FLAG_ENABLE = 1, FLAG_DEFAULT = 2 };
+
  explicit Flag() { Refresh(); }

  virtual ~Flag() {}
@@ -104,6 +111,12 @@ class Flag {
    var = os::GetEnvVar("HSA_DISABLE_FRAGMENT_ALLOCATOR");
    disable_fragment_alloc_ = (var == "1") ? true : false;

+    var = os::GetEnvVar("HSA_UNPATCH_XGMI_LINK_WEIGHT");
+    patch_xgmi_link_weight_ = (var == "1") ? false : true;
+
+    var = os::GetEnvVar("HSA_UNPATCH_LINK_OVERRIDE");
+    patch_link_override_ = (var == "1") ? false : true;
+
    var = os::GetEnvVar("HSA_ENABLE_SDMA_HDP_FLUSH");
    enable_sdma_hdp_flush_ = (var == "0") ? false : true;

@@ -130,6 +143,16 @@ class Flag {

    var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT");
    check_sramecc_validity_ = (var == "1") ? false : true;
+    
+    // Legal values are zero "0" or one "1". Any other value will
+    // be interpreted as not defining the env variable
+    var = os::GetEnvVar("HSA_XNACK");
+    xnack_ = (var == "0") ? XNACK_DISABLE : ((var == "1") ? XNACK_ENABLE : XNACK_UNCHANGED);
+
+    // Legal values are zero "0" or one "1". Any other value will
+    // be interpreted as not defining the env variable.
+    var = os::GetEnvVar("HSA_FORCE_SRAMECC");
+    sramecc_ = (var == "0") ? FLAG_DISABLE : ((var == "1") ? FLAG_ENABLE : FLAG_DEFAULT);
  }

  bool check_flat_scratch() const { return check_flat_scratch_; }
@@ -150,6 +173,11 @@ class Flag {

  bool disable_fragment_alloc() const { return disable_fragment_alloc_; }

+  // Temporary way to control ROCr interpretation of inter-device link weight
+  bool patch_xgmi_link_weight() const { return patch_xgmi_link_weight_; }
+
+  bool patch_link_override() const { return patch_link_override_; }
+
  bool rev_copy_dir() const { return rev_copy_dir_; }

  bool fine_grain_pcie() const { return fine_grain_pcie_; }
@@ -178,6 +206,10 @@ class Flag {

  bool check_sramecc_validity() const { return check_sramecc_validity_; }

+  XNACK_REQUEST xnack() const { return xnack_; }
+
+  FLAG_TRI_STATE sramecc() const { return sramecc_; }
+
 private:
  bool check_flat_scratch_;
  bool enable_vm_fault_message_;
@@ -195,6 +227,8 @@ class Flag {
  bool disable_image_;
  bool loader_enable_mmap_uri_;
  bool check_sramecc_validity_;
+  bool patch_xgmi_link_weight_;
+  bool patch_link_override_;

  SDMA_OVERRIDE enable_sdma_;

@@ -209,6 +243,12 @@ class Flag {

  size_t force_sdma_size_;

+  // Indicates user preference for Xnack state.
+  XNACK_REQUEST xnack_;
+
+  // Indicates user preference for SramECC state.
+  FLAG_TRI_STATE sramecc_;
+
  DISALLOW_COPY_AND_ASSIGN(Flag);
 };

@@ -222,6 +222,9 @@ global:
 	hsa_amd_deregister_deallocation_callback;
 	hsa_amd_signal_value_pointer;
 	_amdgpu_r_debug;
+	hsa_amd_svm_attributes_set;
+	hsa_amd_svm_attributes_get;
+	hsa_amd_svm_prefetch_async;

 local:
    *;
@@ -82,6 +82,7 @@ extern uint8_t ocl_blit_object_gfx902[];
 extern uint8_t ocl_blit_object_gfx904[];
 extern uint8_t ocl_blit_object_gfx906[];
 extern uint8_t ocl_blit_object_gfx908[];
+extern uint8_t ocl_blit_object_gfx90a[];
 extern uint8_t ocl_blit_object_gfx1010[];
 extern uint8_t ocl_blit_object_gfx1011[];
 extern uint8_t ocl_blit_object_gfx1012[];
@@ -990,6 +991,8 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name,
    *blit_code_object = ocl_blit_object_gfx906;
  } else if (sname == "gfx908") {
    *blit_code_object = ocl_blit_object_gfx908;
+  } else if (sname == "gfx90a") {
+    *blit_code_object = ocl_blit_object_gfx90a;
  } else if (sname == "gfx1010") {
    *blit_code_object = ocl_blit_object_gfx1010;
  } else if (sname == "gfx1011") {
@@ -69,7 +69,7 @@ endif()

 # Determine the target devices if not specified
 if (NOT DEFINED TARGET_DEVICES)
-  set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
+  set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
 endif()
 set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE )

@@ -50,17 +50,11 @@
 namespace rocr {
 namespace image {

-uint32_t MajorVerFromDevID(uint32_t dev_id) {
-  return dev_id/100;
-}
+uint32_t MajorVerFromDevID(uint32_t dev_id) { return dev_id >> 8; }

-uint32_t MinorVerFromDevID(uint32_t dev_id) {
-  return (dev_id % 100)/10;
-}
+uint32_t MinorVerFromDevID(uint32_t dev_id) { return (dev_id >> 4) & 0xF; }

-uint32_t StepFromDevID(uint32_t dev_id) {
-  return (dev_id%100)%10;
-}
+uint32_t StepFromDevID(uint32_t dev_id) { return dev_id & 0xF; }

 hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) {
  char asic_name[64];
@@ -78,7 +72,10 @@ hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) {
  assert(a_str.compare(0, 3, "gfx", 3) == 0);

  a_str.erase(0,3);
-  *chip_id = std::stoi(a_str);
+
+  // Build chip_id: last two chars are minor and stepping (hex digits); the rest is the decimal major.
+  *chip_id = std::stoi(a_str.substr(a_str.length() - 2), nullptr, 16);
+  *chip_id += (std::stoi(a_str.substr(0, a_str.length() - 2)) << 8);
  return HSA_STATUS_SUCCESS;
 }

@@ -163,6 +160,7 @@ uint32_t DevIDToAddrLibFamily(uint32_t dev_id) {
            case 4:   // Vega12
            case 6:   // Vega20
            case 8:   // Arcturus
+            case 10:  // Aldebaran
              return FAMILY_AI;

            case 2:
@@ -115,10 +115,13 @@ enum : unsigned {
  EF_AMDGPU_MACH_AMDGCN_GFX602        = 0x03a,
  EF_AMDGPU_MACH_AMDGCN_GFX705        = 0x03b,
  EF_AMDGPU_MACH_AMDGCN_GFX805        = 0x03c,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3D = 0x03d,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3E = 0x03e,
+  EF_AMDGPU_MACH_AMDGCN_GFX90A        = 0x03f,

  // First/last AMDGCN-based processors.
  EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
-  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A,

  // Indicates if the "xnack" target feature is enabled for all code contained
  // in the object.
@@ -480,7 +480,21 @@ typedef enum {
  /**
  * String containing the ROCr build identifier.
  */
-  HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200
+  HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200,
+  /**
+  * Returns true if hsa_amd_svm_* APIs are supported by the driver.  The type of
+  * this attribute is bool.
+  */
+  HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201,
+  // TODO: Should this be per Agent?
+  /**
+  * Returns true if all Agents have access to system allocated memory (such as
+  * that allocated by mmap, malloc, or new) by default.
+  * If false then system allocated memory may only be made SVM accessible to
+  * an Agent by declaration of accessibility with hsa_amd_svm_attributes_set.
+  * The type of this attribute is bool.
+  */
+  HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202
 } hsa_system_info_t;

 /**
@@ -183,6 +183,9 @@ struct AmdExtTable {
  decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn;
  decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn;
  decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn;
+  decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn;
+  decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn;
+  decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn;
 };

 // Table to export HSA Core Runtime Apis
@@ -2116,6 +2116,162 @@ hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr,
 hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
                                                      hsa_amd_deallocation_callback_t callback);

+typedef enum hsa_amd_svm_model_s {
+  /**
+   * Updates to memory with this attribute conform to HSA memory consistency model.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0,
+  /**
+   * Writes to memory with this attribute can be performed by a single agent at a time.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1
+} hsa_amd_svm_model_t;
+
+typedef enum hsa_amd_svm_attribute_s {
+  // Memory model attribute.
+  // Type of this attribute is hsa_amd_svm_model_t.
+  HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0,
+  // Marks the range read only.  This allows multiple physical copies to be
+  // placed local to each accessing device.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_READ_ONLY = 1,
+  // Automatic migrations should attempt to keep the memory within the xgmi hive
+  // containing accessible agents.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2,
+  // Page granularity to migrate at once.  Page granularity is specified as
+  // log2(page_count).
+  // Type of this attribute is uint64_t.
+  HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3,
+  // Physical location to prefer when automatic migration occurs.
+  // Set to the null agent handle (handle == 0) to indicate there
+  // is no preferred location.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_set (see
+  // ::hsa_amd_svm_prefetch_async).
+  // Physical location of most recent prefetch command.
+  // If the prefetch location has not been set or is not uniform across the
+  // address range then returned hsa_agent_t::handle will be 0.
+  // Querying this attribute will return the destination agent of the most
+  // recent ::hsa_amd_svm_prefetch_async targeting the address range.  If
+  // multiple async prefetches have been issued targeting the region and the
+  // most recently issued prefetch has completed then the query will return
+  // the location of the most recently completed prefetch.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range.  Access may incur a page fault
+  // and associated memory migration.  Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM
+  // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range without page faults.  Access
+  // will not incur a page fault and will not cause access-based memory
+  // migration.  Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if
+  // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Denies an agent access to the memory range.  Access will cause a terminal
+  // segfault.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_set.
+  // Returns the access attribute associated with the agent.
+  // The agent to query must be set in the attribute value field.
+  // The attribute enum will be replaced with the agent's current access
+  // attribute for the address range.
+  // TODO: Clarify KFD return value for non-uniform access attribute.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203,
+} hsa_amd_svm_attribute_t;
+
+// Element type of the attribute lists passed to hsa_amd_svm_attributes_set/get.
+typedef struct hsa_amd_svm_attribute_pair_s {
+  // hsa_amd_svm_attribute_t value.
+  uint64_t attribute;
+  // Attribute value.  Bit values should be interpreted according to the type
+  // given in the associated attribute description.
+  uint64_t value;
+} hsa_amd_svm_attribute_pair_t;
+
+/**
+ * @brief Sets SVM memory attributes.
+ *
+ * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling
+ * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
+ * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM
+ * memory access by that Agent.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION
+ * may not be used with this API.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Gets SVM memory attributes.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE,
+ * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and
+ * HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS may not be used with this API.
+ *
+ * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an
+ * hsa_agent_t and returns the current access type through its attribute field.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in,out] attribute_list List of attributes to query for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Asynchronously migrates memory to an agent.
+ *
+ * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero.
+ * @p completion_signal will decrement when the migration is complete.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] agent Agent to migrate to.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the migration
+ * operation starts. The migration will start after every signal has been observed with
+ * the value 0. If @p num_dep_signals is 0, this argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the migration
+ * operation. When the migration operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the migration
+ * operation by setting the value of the completion signal to a negative
+ * number. If no completion signal is required this handle may be null.
+ */
+hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                        hsa_signal_t completion_signal);
+
 #ifdef __cplusplus
 }  // end extern "C" block
 #endif
@@ -568,6 +568,7 @@ namespace code {
      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906:  name = "gfx906";  xnack_supported = true;  sramecc_supported = true;  break;
      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908:  name = "gfx908";  xnack_supported = true;  sramecc_supported = true;  break;
      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909:  name = "gfx909";  xnack_supported = true;  sramecc_supported = false; break;
+      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A:  name = "gfx90a";  xnack_supported = true;  sramecc_supported = true;  break;
      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C:  name = "gfx90c";  xnack_supported = true;  sramecc_supported = false; break;
      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: name = "gfx1010"; xnack_supported = true;  sramecc_supported = false; break;
      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: name = "gfx1011"; xnack_supported = true;  sramecc_supported = false; break;
@@ -80,26 +80,27 @@ namespace loader {
    : out(std::cout)
  {
    invalid.handle = 0;
-    gfx700.handle = 700;
-    gfx701.handle = 701;
-    gfx702.handle = 702;
-    gfx801.handle = 801;
-    gfx802.handle = 802;
-    gfx803.handle = 803;
-    gfx805.handle = 805;
-    gfx810.handle = 810;
-    gfx900.handle = 900;
-    gfx902.handle = 902;
-    gfx904.handle = 904;
-    gfx906.handle = 906;
-    gfx908.handle = 908;
-    gfx1010.handle = 1010;
-    gfx1011.handle = 1011;
-    gfx1012.handle = 1012;
-    gfx1030.handle = 1030;
-    gfx1031.handle = 1031;
-    gfx1032.handle = 1032;
-    gfx1033.handle = 1033;
+    gfx700.handle = 0x700;
+    gfx701.handle = 0x701;
+    gfx702.handle = 0x702;
+    gfx801.handle = 0x801;
+    gfx802.handle = 0x802;
+    gfx803.handle = 0x803;
+    gfx805.handle = 0x805;
+    gfx810.handle = 0x810;
+    gfx900.handle = 0x900;
+    gfx902.handle = 0x902;
+    gfx904.handle = 0x904;
+    gfx906.handle = 0x906;
+    gfx908.handle = 0x908;
+    gfx90a.handle = 0x90a;
+    gfx1010.handle = 0x1010;
+    gfx1011.handle = 0x1011;
+    gfx1012.handle = 0x1012;
+    gfx1030.handle = 0x1030;
+    gfx1031.handle = 0x1031;
+    gfx1032.handle = 0x1032;
+    gfx1033.handle = 0x1033;
  }

  hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name)
@@ -137,6 +138,8 @@ namespace loader {
      return gfx906;
    } else if (sname == "AMD:AMDGPU:9:0:8") {
      return gfx908;
+    } else if (sname == "AMD:AMDGPU:9:0:A") {
+      return gfx90a;
    } else if (sname == "AMD:AMDGPU:10:1:0") {
      return gfx1010;
    } else if (sname == "AMD:AMDGPU:10:1:1") {
@@ -57,7 +57,7 @@ namespace loader {
    hsa_isa_t invalid;
    hsa_isa_t gfx700, gfx701, gfx702, gfx703, gfx704, gfx705;
    hsa_isa_t gfx801, gfx802, gfx803, gfx805, gfx810;
-    hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908;
+    hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908, gfx90a;
    hsa_isa_t gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, gfx1032, gfx1033;
    std::ostream& out;
    typedef std::set<void*> PointerSet;