diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 08fa7b284e..7232856e63 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -49,6 +49,7 @@
 #include
 #include
 #include
+#include <numaif.h>
 #endif // WITHOUT_HSA_BACKEND
 
 #define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR)
@@ -1782,12 +1783,84 @@ void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
   stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr);
   if (stat != HSA_STATUS_SUCCESS) {
     LogError("Fail hsa_amd_agents_allow_access");
+    hostFree(ptr, size);
     return nullptr;
   }
 
   return ptr;
 }
 
+// Allocates `size` bytes of host memory from a pool of the agent described by
+// `agentInfo` and makes it accessible to every agent in gpu_agents_.
+//
+// The coarse-grained pool is used when `atomics` is false and the agent has
+// one; in every other case the fine-grained pool is used.
+// Returns nullptr if the allocation or the access grant fails.
+void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomics) const {
+  void* ptr = nullptr;
+  const bool useCoarseGrain = !atomics && (agentInfo.coarse_grain_pool.handle != 0);
+  const hsa_amd_memory_pool_t segment =
+      useCoarseGrain ? agentInfo.coarse_grain_pool : agentInfo.fine_grain_pool;
+  assert(segment.handle != 0);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  if (stat != HSA_STATUS_SUCCESS) {
+    LogPrintfError("Fail allocation host memory with err %d", stat);
+    return nullptr;
+  }
+
+  stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr);
+  if (stat != HSA_STATUS_SUCCESS) {
+    LogPrintfError("Fail hsa_amd_agents_allow_access with err %d", stat);
+    // Do not leak the pool allocation when access cannot be granted.
+    hostFree(ptr, size);
+    return nullptr;
+  }
+
+  return ptr;
+}
+
+// Allocates host memory according to the NUMA memory policy reported by
+// get_mempolicy().
+//
+// For MPOL_BIND and MPOL_PREFERRED the memory comes from the CPU agent that
+// matches the lowest node set in the policy's node mask; that path goes through
+// hostAgentAlloc() and does not use `alignment`. Every other policy, and a mask
+// that selects none of the known CPU agents (for instance MPOL_PREFERRED with an
+// empty mask, which requests local allocation), uses hostAlloc().
+//
+// Returns nullptr if the policy query or the allocation fails.
+void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
+  int mode = MPOL_DEFAULT;
+  unsigned long nodeMask = 0;
+  const size_t cpuCount = cpu_agents_.size();
+
+  constexpr unsigned long maxNode = sizeof(nodeMask) * 8;
+  const long res = get_mempolicy(&mode, &nodeMask, maxNode, nullptr, 0);
+  if (res) {
+    LogPrintfError("get_mempolicy failed with error %ld", res);
+    return nullptr;
+  }
+  ClPrint(amd::LOG_INFO, amd::LOG_RESOURCE,
+          "get_mempolicy() succeed with mode %d, nodeMask 0x%lx, cpuCount %zu",
+          mode, nodeMask, cpuCount);
+
+  // For details, see "man get_mempolicy".
+  if (mode == MPOL_BIND || mode == MPOL_PREFERRED) {
+    // Only the first node set in the mask is honoured.
+    // NOTE(review): assumes cpu_agents_ is indexed by NUMA node id - confirm.
+    // The mask holds maxNode bits: stop there, and shift a value as wide as the
+    // mask so that high node numbers are tested without undefined behaviour.
+    for (size_t node = 0; node < cpuCount && node < maxNode; ++node) {
+      if ((1UL << node) & nodeMask) {
+        return hostAgentAlloc(size, cpu_agents_[node], atomics);
+      }
+    }
+  }
+
+  // Default policy, unhandled policies, or no CPU agent selected by the mask.
+  return hostAlloc(size, alignment, atomics);
+}
+
 void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); }
 
 void* Device::deviceLocalAlloc(size_t size, bool atomics) const {
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 039509f26d..b32bfcffe7 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -308,7 +308,6 @@ class Device : public NullDevice {
   virtual hsa_agent_t getBackendDevice() const { return _bkendDevice; }
   const hsa_agent_t &getCpuAgent() const { return cpu_agent_; } // Get the CPU agent with the least NUMA distance to this GPU
 
-
   static const std::vector<hsa_agent_t>& getGpuAgents() { return gpu_agents_; }
 
   static const std::vector<AgentInfo>& getCpuAgents() { return cpu_agents_; }
@@ -389,6 +388,12 @@ class Device : public NullDevice {
   virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
                             cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
 
+  //! Allocate host memory in terms of numa policy set by user
+  void* hostNumaAlloc(size_t size, size_t alignment, bool atomics = false) const;
+
+  //! Allocate host memory from agent info
+  void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomics = false) const;
+
   //! Returns transfer engine object
   const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }
 
diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp
index a1d864f6ba..38949e754d 100755
--- a/rocclr/device/rocm/rocmemory.cpp
+++ b/rocclr/device/rocm/rocmemory.cpp
@@ -753,11 +753,10 @@ bool Buffer::create() {
 #else
         deviceMemory_ = dev().hostAlloc(size(), 1, false);
 #endif // AMD_HMM_SUPPORT
-      } else if (memFlags & CL_MEM_SVM_ATOMICS) {
-        deviceMemory_ = dev().hostAlloc(size(), 1, true);
-      }
-      else {
-        deviceMemory_ = dev().hostAlloc(size(), 1, false);
+      } else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
+        deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
+      } else {
+        deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
       }
     } else {
       assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");