diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 08fa7b284e..7232856e63 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -49,6 +49,7 @@
 #include <iostream>
 #include <vector>
 #include <algorithm>
+#include <numaif.h>
 #endif  // WITHOUT_HSA_BACKEND
 
 #define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR)
@@ -1782,12 +1783,72 @@ void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
   stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr);
   if (stat != HSA_STATUS_SUCCESS) {
     LogError("Fail hsa_amd_agents_allow_access");
+    hostFree(ptr, size);
     return nullptr;
   }
 
   return ptr;
 }
 
+//! Allocates host memory from a pool of agentInfo and grants the agents in gpu_agents_ access.
+//! Uses the coarse-grain pool only if atomics is false and that pool exists, else fine-grain.
+void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomics) const {
+  const bool useCoarseGrain = !atomics && (agentInfo.coarse_grain_pool.handle != 0);
+  const hsa_amd_memory_pool_t segment =
+      useCoarseGrain ? agentInfo.coarse_grain_pool : agentInfo.fine_grain_pool;
+  assert(segment.handle != 0);
+  void* ptr = nullptr;
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  if (stat != HSA_STATUS_SUCCESS) {
+    LogPrintfError("Fail allocation host memory with err %d", stat);
+    return nullptr;
+  }
+
+  stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr);
+  if (stat == HSA_STATUS_SUCCESS) {
+    return ptr;
+  }
+  // Access could not be granted: release the fresh allocation before reporting failure.
+  LogPrintfError("Fail hsa_amd_agents_allow_access with err %d", stat);
+  hostFree(ptr, size);
+  return nullptr;
+}
+
+//! Allocates host memory following the NUMA policy reported by get_mempolicy()
+//! (see "man get_mempolicy"). For MPOL_BIND and MPOL_PREFERRED the first CPU agent whose bit is
+//! set in the policy node mask is used. Every other mode, a failed policy query, or a mask that
+//! matches no CPU agent falls back to hostAlloc(). Returns nullptr if the allocation fails.
+void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
+  int mode = MPOL_DEFAULT;
+  unsigned long nodeMask = 0;
+  const size_t cpuCount = cpu_agents_.size();
+
+  constexpr unsigned long maxNode = sizeof(nodeMask) * 8;
+  const long res = get_mempolicy(&mode, &nodeMask, maxNode, nullptr, 0);
+  if (res) {
+    // The policy is only a placement hint; an unavailable query must not fail the allocation.
+    LogPrintfError("get_mempolicy failed with error %ld", res);
+    return hostAlloc(size, alignment, atomics);
+  }
+  ClPrint(amd::LOG_INFO, amd::LOG_RESOURCE,
+          "get_mempolicy() succeed with mode %d, nodeMask 0x%lx, cpuCount %zu",
+          mode, nodeMask, cpuCount);
+
+  if (mode == MPOL_BIND || mode == MPOL_PREFERRED) {
+    // We only care about the first CPU node. Assumes cpu_agents_[i] corresponds to NUMA node i.
+    // The probe bit is as wide as nodeMask, so nodes >= 32 are tested without shift overflow.
+    for (size_t i = 0; i < cpuCount && i < maxNode; ++i) {
+      if ((1ul << i) & nodeMask) {
+        return hostAgentAlloc(size, cpu_agents_[i], atomics);
+      }
+    }
+    // Empty mask (e.g. MPOL_PREFERRED meaning local allocation) or no agent for these nodes.
+  }
+
+  //  All other modes fall back to default mode
+  return hostAlloc(size, alignment, atomics);
+}
+
 void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); }
 
 void* Device::deviceLocalAlloc(size_t size, bool atomics) const {
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 039509f26d..b32bfcffe7 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -308,7 +308,6 @@ class Device : public NullDevice {
   virtual hsa_agent_t getBackendDevice() const { return _bkendDevice; }
   const hsa_agent_t &getCpuAgent() const { return cpu_agent_; } // Get the CPU agent with the least NUMA distance to this GPU
 
-
   static const std::vector<hsa_agent_t>& getGpuAgents() { return gpu_agents_; }
   static const std::vector<AgentInfo>& getCpuAgents() { return cpu_agents_; }
 
@@ -389,6 +388,12 @@ class Device : public NullDevice {
   virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
                             cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
 
+  //! Allocate host memory according to the NUMA memory policy set by the user
+  void* hostNumaAlloc(size_t size, size_t alignment, bool atomics = false) const;
+
+  //! Allocate host memory from a memory pool of the given agent
+  void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomics = false) const;
+
   //! Returns transfer engine object
   const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }
 
diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp
index a1d864f6ba..38949e754d 100755
--- a/rocclr/device/rocm/rocmemory.cpp
+++ b/rocclr/device/rocm/rocmemory.cpp
@@ -753,11 +753,10 @@ bool Buffer::create() {
 #else
           deviceMemory_ = dev().hostAlloc(size(), 1, false);
 #endif // AMD_HMM_SUPPORT
-        } else if (memFlags & CL_MEM_SVM_ATOMICS) {
-          deviceMemory_ = dev().hostAlloc(size(), 1, true);
-        }
-        else {
-          deviceMemory_ = dev().hostAlloc(size(), 1, false);
+        } else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
+          deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
+        } else {
+          deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
         }
       } else {
         assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");