SWDEV-555888 - Refactor Numa code (#1191)

1. Create a set of mini numa interface.
In Linux, the interface is based on system call rather than libnuma.
In Windows, the interface can also work, but the policy class is dummy.
Different from Linux, Windows doesn't provide numactl tool or numa lib to setup numa policy, thus
the default policy is followed in Windows, that is, using the closest host numa node to allocate
pinned host memory in hipHostMalloc().
To get the closest host numa node of a GPU device, you need query the new attribute
hipDeviceAttributeHostNumaId. Then you can create a thread with CPU affinity on the numa node.
For example, reference the test in hip-tests/catch/perftests/memory/hipPerfHostNumaAllocWin.cc.

2. Remove pfnSetThreadGroupAffinity and pfnGetNumaNodeProcessorMaskEx as the functions have been exposed since Win7 and Win server 2008.

3. Other minor fixes.
This commit is contained in:
MachineTom
2025-10-23 21:56:15 -04:00
committato da GitHub
parent 602ea0be1e
commit 5f76cb916d
18 ha cambiato i file con 631 aggiunte e 134 eliminazioni
@@ -454,6 +454,9 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
case hipDeviceAttributeMaxAvailableVgprsPerThread:
*pi = static_cast<int>(g_devices[device]->devices()[0]->info().availableVGPRs_);
break;
case hipDeviceAttributeHostNumaId:
*pi = static_cast<int>(g_devices[device]->devices()[0]->getPreferredNumaNode());
break;
default:
HIP_RETURN(hipErrorInvalidValue);
}
@@ -67,13 +67,6 @@ endif()
#target_include_directories(rocclr PRIVATE ${AMD_HSA_INCLUDE_DIR}/..)
find_package(NUMA)
if(NUMA_FOUND)
target_compile_definitions(rocclr PUBLIC ROCCLR_SUPPORT_NUMA_POLICY)
target_include_directories(rocclr PUBLIC ${NUMA_INCLUDE_DIR})
target_link_libraries(rocclr PUBLIC ${NUMA_LIBRARIES})
endif()
find_package(OpenGL REQUIRED)
target_sources(rocclr PRIVATE
+4 -1
Vedi File
@@ -2003,7 +2003,10 @@ class Device : public RuntimeObject {
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
virtual const uint32_t getPreferredNumaNode() const { return 0; }
virtual uint32_t getPreferredNumaNode() const {
return static_cast<uint32_t>(-1); //!< PAL doesn't support it
}
virtual void ReleaseGlobalSignal(void* signal) const {}
virtual const bool isFineGrainSupported() const {
return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
@@ -54,10 +54,6 @@
#include <iostream>
#include <iomanip>
#include <memory>
#ifdef ROCCLR_SUPPORT_NUMA_POLICY
#include <numa.h>
#include <numaif.h>
#endif // ROCCLR_SUPPORT_NUMA_POLICY
#include <sstream>
#include <vector>
@@ -2039,30 +2035,17 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
// ================================================================================================
void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
void* ptr = nullptr;
#ifndef ROCCLR_SUPPORT_NUMA_POLICY
ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
#else
int mode = MPOL_DEFAULT;
int maxNodes = numa_num_possible_nodes();
bitmask* nodeMask = numa_bitmask_alloc(maxNodes);
auto cpuCount = cpu_agents_.size();
long res = get_mempolicy(&mode, nodeMask->maskp, nodeMask->size, NULL, 0);
if (res) {
LogPrintfError("get_mempolicy failed with error %ld", res);
return ptr;
auto numa_node_count = cpu_agents_.size(); // count of host numa nodes
numa::NumaPolicy np(numa_node_count);
if (!np.GetMemPolicy()) {
return hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
}
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_RESOURCE,
"get_mempolicy() succeed with mode %d, nodeMask 0x%lx, cpuCount %zu", mode,
*nodeMask->maskp, cpuCount);
switch (mode) {
// For details, see "man get_mempolicy".
case MPOL_BIND:
case MPOL_PREFERRED:
switch (np.GetPolicy()) {
case numa::NumaPolicy::Policy::kPrefered:
case numa::NumaPolicy::Policy::kBind:
// We only care about the first CPU node
for (unsigned int i = 0; i < cpuCount; i++) {
if ((1u << i) & *nodeMask->maskp) {
for (unsigned int i = 0; i < numa_node_count; i++) {
if (np.IsPolicySetAt(i)) {
ptr = hostAlloc(size, alignment, mem_seg, &cpu_agents_[i]);
break;
}
@@ -2072,8 +2055,6 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
// All other modes fall back to default mode
ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
}
numa_free_cpumask(nodeMask);
#endif // ROCCLR_SUPPORT_NUMA_POLICY
return ptr;
}
@@ -562,7 +562,7 @@ class Device : public NullDevice {
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0);
const uint32_t getPreferredNumaNode() const { return preferred_numa_node_; }
virtual uint32_t getPreferredNumaNode() const final { return preferred_numa_node_; }
const bool isFineGrainSupported() const;
+65 -1
Vedi File
@@ -23,6 +23,7 @@
#include "top.hpp"
#include "utils/util.hpp"
#include "utils/flags.hpp"
#include <vector>
#include <string>
@@ -232,7 +233,7 @@ class Os : AllStatic {
static void alignedFree(void* mem);
//! NUMA related settings
static void setPreferredNumaNode(uint32_t node);
inline static void setPreferredNumaNode(uint32_t node);
// File/Path helper routines:
//
@@ -526,6 +527,69 @@ inline uint Os::ThreadAffinityMask::getNextSet(uint cpu) const {
#endif
/* Mini numa interface instead of numa lib apis */
namespace numa {
static constexpr uint32_t kBitsPerUInt64 = 8 * sizeof(uint64_t);
/*! \brief Manage Numa policy.
*
* \note Works in Linux only, dummy in Windows.
*/
class NumaPolicy final {
public:
enum class Policy {
kDefault = 0,
kPrefered = 1,
kBind = 2,
kInterleave = 3,
kLocal = 4,
kPreferedMany = 5,
kWeightedInterleave = 6,
kMax = 7
};
NumaPolicy(uint32_t numa_node_count);
//! Query memory policy and node bitmask from Linux kernel
bool GetMemPolicy();
//! Check whether node_index is in bitmask for kPrefered and kBind modes
bool IsPolicySetAt(uint32_t node_index) const;
//! Return the queried policy
Policy GetPolicy() const { return policy_; }
private:
std::vector<uint64_t> node_map_{}; //!< Node bitmask for kPrefered and kBind modes
Policy policy_{Policy::kDefault}; //!< The policy
};
/*! \brief Manage Numa node.
*
* \note Works in Linux and Windows.
*/
class NumaNode final {
public:
NumaNode (uint32_t node_index): node_index_(node_index) {}
~NumaNode();
//! Apply the CPU affinity mask of the node onto the current thread
bool SchedSetAffinity();
private:
uint32_t node_index_; //! Index of the Numa node
void* affinity_ = nullptr; //!< Affinity mask of logical CPUs on this node
uint32_t size_ = 0; //!< Number of valid bits
//! Guery the affinity mask of logical CPUs on this node
bool GetAffinity();
};
} // namespace numa
inline void Os::setPreferredNumaNode(uint32_t node) {
if (AMD_CPU_AFFINITY) {
numa::NumaNode numaNode(node);
numaNode.SchedSetAffinity();
}
}
} // namespace amd
#endif /*OS_HPP_*/
+102 -22
Vedi File
@@ -19,11 +19,10 @@
THE SOFTWARE. */
#if !defined(_WIN32) && !defined(__CYGWIN__)
#include <unistd.h>
#include <sys/syscall.h>
#include "os/os.hpp"
#include "thread/thread.hpp"
#include "utils/util.hpp"
#include "utils/flags.hpp"
#include <iostream>
#include <stdarg.h>
@@ -49,11 +48,6 @@
#ifndef DT_GNU_HASH
#define DT_GNU_HASH 0x6ffffef5
#endif // DT_GNU_HASH
#ifdef ROCCLR_SUPPORT_NUMA_POLICY
#include <numa.h>
#endif // ROCCLR_SUPPORT_NUMA_POLICY
#include <atomic>
#include <vector>
#include <string>
@@ -327,20 +321,6 @@ void Os::currentStackInfo(address* base, size_t* size) {
void Os::setCurrentThreadName(const char* name) { ::prctl(PR_SET_NAME, name); }
void Os::setPreferredNumaNode(uint32_t node) {
#ifdef ROCCLR_SUPPORT_NUMA_POLICY
if (AMD_CPU_AFFINITY && (numa_available() >= 0)) {
bitmask* bm = numa_allocate_cpumask();
numa_node_to_cpus(node, bm);
if (numa_sched_setaffinity(0, bm) < 0) {
assert(0 && "failed to set affinity");
}
numa_free_cpumask(bm);
}
#endif // ROCCLR_SUPPORT_NUMA_POLICY
}
void* Thread::entry(Thread* thread) {
sigset_t set;
@@ -979,6 +959,106 @@ void Os::CxaDemangle(const std::string& name, std::string* result) {
free(demangled);
}
namespace numa {
// ================================================================================================
NumaPolicy::NumaPolicy(const uint32_t numa_node_count) :
node_map_((numa_node_count + kBitsPerUInt64 - 1) / kBitsPerUInt64, 0) { }
// ================================================================================================
bool NumaPolicy::GetMemPolicy() {
int policy = 0;
if (syscall(__NR_get_mempolicy, &policy, node_map_.data(),
node_map_.size() * kBitsPerUInt64, nullptr, 0) < 0) {
ClPrint(amd::LOG_DEBUG, amd::LOG_RESOURCE,
"syscall(__NR_get_mempolicy, size=%zu) failed to query policy",
node_map_.size() * kBitsPerUInt64);
return false;
}
if (policy < static_cast<int>(Policy::kDefault) || policy > static_cast<int>(Policy::kMax)) {
ClPrint(amd::LOG_DEBUG, amd::LOG_RESOURCE,
"syscall(__NR_get_mempolicy) returned wrong policy %d", policy);
return false;
}
policy_ = static_cast<Policy>(policy);
return true;
}
// ================================================================================================
bool NumaPolicy::IsPolicySetAt(uint32_t node_index) const {
const uint32_t i = node_index / kBitsPerUInt64;
if (i < node_map_.size()) {
return ((node_map_[i] >> (node_index % kBitsPerUInt64)) & 1) ?
true: false;
} else {
return false;
}
}
// ================================================================================================
NumaNode::~NumaNode() {
if (affinity_) {
delete static_cast<std::vector<uint64_t> *>(affinity_);
affinity_ = nullptr;
}
}
// ================================================================================================
bool NumaNode::GetAffinity() {
const std::string path = "/sys/devices/system/node/node" + std::to_string(node_index_) +
"/cpumap";
std::ifstream file(path);
if (!file) {
std::cerr << "Failed to open " << path << "\n";
ClPrint(amd::LOG_DEBUG, amd::LOG_RESOURCE, "%s cannot be opened", path);
return false;
}
std::string line;
std::getline(file, line);
file.close();
// To remove commas and whitespace
line.erase(std::remove_if(line.begin(), line.end(),
[](unsigned char x) { return std::isspace(x) || x == ','; }), line.end());
constexpr uint32_t kHexsPerUInt64 = 2 * sizeof(uint64_t);
auto affinity = new std::vector<uint64_t>((line.size() + kHexsPerUInt64 - 1) / kHexsPerUInt64);
auto iter = affinity->begin();
// To parse from the end (little-endian layout)
for (int i = line.size(); i > 0; i -= kHexsPerUInt64) {
uint32_t start = (i >= kHexsPerUInt64) ? i - kHexsPerUInt64 : 0;
uint32_t len = (i >= kHexsPerUInt64) ? kHexsPerUInt64 : i;
const std::string chunk = line.substr(start, len);
const uint64_t value = std::stoul(chunk, nullptr, 16);
*(iter++) = value;
if (len == kHexsPerUInt64) {
size_ += kBitsPerUInt64;
} else {
// Last one
size_ = kBitsPerUInt64 - __builtin_clzl(value);
}
}
affinity_ = affinity;
return true;
}
// ================================================================================================
bool NumaNode::SchedSetAffinity() {
if (!GetAffinity()) {
return false;
}
if (syscall(__NR_sched_setaffinity, 0, size_,
static_cast<std::vector<uint64_t>*>(affinity_)->data()) < 0) {
ClPrint(amd::LOG_DEBUG, amd::LOG_RESOURCE,
"syscall(__NR_sched_setaffinity, size=%u) failed", size_);
return false;
}
return true;
}
} // namespace numa
} // namespace amd
#endif // !defined(_WIN32) && !defined(__CYGWIN__)
+83 -30
Vedi File
@@ -22,7 +22,6 @@
#include "os/os.hpp"
#include "thread/thread.hpp"
#include "utils/flags.hpp"
#include <windows.h>
#include <process.h>
#include <tchar.h>
@@ -40,9 +39,6 @@
#define WINAPI
#endif
BOOL(WINAPI* pfnGetNumaNodeProcessorMaskEx)(USHORT, PGROUP_AFFINITY) = NULL;
namespace amd {
static size_t allocationGranularity_;
@@ -54,10 +50,7 @@ PVOID divExceptionHandler = NULL;
#endif // _WIN64
static double PerformanceFrequency;
typedef BOOL(WINAPI* SetThreadGroupAffinity_fn)(__in HANDLE, __in CONST GROUP_AFFINITY*,
__out_opt PGROUP_AFFINITY);
static SetThreadGroupAffinity_fn pfnSetThreadGroupAffinity = NULL;
static GROUP_AFFINITY nativeMask_;
#pragma section(".CRT$XCU", long, read)
__declspec(allocate(".CRT$XCU")) bool (*__init)(void) = Os::init;
@@ -81,12 +74,9 @@ bool Os::init() {
QueryPerformanceFrequency(&frequency);
PerformanceFrequency = (double)frequency.QuadPart;
HMODULE handle = ::LoadLibrary("kernel32.dll");
if (handle != NULL) {
pfnSetThreadGroupAffinity =
(SetThreadGroupAffinity_fn)::GetProcAddress(handle, "SetThreadGroupAffinity");
pfnGetNumaNodeProcessorMaskEx = (BOOL(WINAPI*)(USHORT, PGROUP_AFFINITY))::GetProcAddress(
handle, "GetNumaNodeProcessorMaskEx");
if (!GetThreadGroupAffinity(GetCurrentThread(), &nativeMask_)) {
ClPrint(amd::LOG_ERROR, amd::LOG_INIT, "Failed getting main thread affinity with error %d",
GetLastError());
}
return Thread::init();
@@ -250,8 +240,6 @@ static void SetThreadName(DWORD threadId, const char* name) {
void Os::setCurrentThreadName(const char* name) { SetThreadName(GetCurrentThreadId(), name); }
void Os::setPreferredNumaNode(uint32_t node) {};
static LONG WINAPI divExceptionFilter(struct _EXCEPTION_POINTERS* ep) {
DWORD code = ep->ExceptionRecord->ExceptionCode;
@@ -325,25 +313,33 @@ const void* Os::createOsThread(Thread* thread) {
return reinterpret_cast<const void*>(handle);
}
// This function only works with CPU core number <= 64.
// SetThreadGroupAffinity does clear the thread's affinity to other processor groups.
// No API yet to set multi-group affinity.
// So only the last group will take affect in this function!
void Os::setThreadAffinity(const void* handle, const Os::ThreadAffinityMask& mask) {
if (pfnSetThreadGroupAffinity != NULL) {
GROUP_AFFINITY group = {0};
for (WORD i = 0; i < sizeof(mask.mask_) / sizeof(KAFFINITY); ++i) {
group.Mask = mask.mask_[i];
group.Group = i;
if (group.Mask != 0) {
pfnSetThreadGroupAffinity((HANDLE)handle, &group, NULL);
}
}
} else { // pfnSetThreadGroupAffinity == NULL
DWORD_PTR threadAffinityMask = (DWORD_PTR)mask.mask_[0];
if (threadAffinityMask != 0) {
::SetThreadAffinityMask((HANDLE)handle, threadAffinityMask);
GROUP_AFFINITY group = {0};
for (WORD i = 0; i < sizeof(mask.mask_) / sizeof(KAFFINITY); ++i) {
group.Mask = mask.mask_[i];
group.Group = i;
if (group.Mask != 0) {
SetThreadGroupAffinity((HANDLE)handle, &group, NULL);
}
}
}
bool Os::setThreadAffinityToMainThread() { return true; }
bool Os::setThreadAffinityToMainThread() {
if (AMD_CPU_AFFINITY) {
ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Setting Affinity to the main thread's affinity");
if (!SetThreadGroupAffinity(GetCurrentThread(), &nativeMask_, nullptr)) {
ClPrint(amd::LOG_ERROR, amd::LOG_INIT, "Failed setting main thread affinity with error %d",
GetLastError());
return false;
}
}
return true;
}
void Os::yield() { ::SwitchToThread(); }
uint64_t Os::timeNanos() {
@@ -751,6 +747,63 @@ bool Os::DumpCoreFile() { return false; }
// ================================================================================================
void Os::CxaDemangle(const std::string& name, std::string* result) { *result = name; }
namespace numa {
// ================================================================================================
NumaPolicy::NumaPolicy(const uint32_t numa_node_count) {
}
// ================================================================================================
bool NumaPolicy::GetMemPolicy() {
// Dummy as Windows doesn't support numa policy
return false;
}
// ================================================================================================
bool NumaPolicy::IsPolicySetAt(uint32_t node_index) const {
// Dummy as Windows doesn't support numa policy
return false;
}
// ================================================================================================
NumaNode::~NumaNode() {
if (affinity_) {
delete static_cast<GROUP_AFFINITY*>(affinity_);
affinity_ = nullptr;
}
}
// ================================================================================================
bool NumaNode::GetAffinity() {
GROUP_AFFINITY *affinity = new GROUP_AFFINITY();
if (!GetNumaNodeProcessorMaskEx(node_index_, affinity)) {
ClPrint(amd::LOG_ERROR, amd::LOG_RESOURCE,
"Failed getting numa node(%u) affinity with error %d",
node_index_, GetLastError());
delete affinity;
return false;
}
affinity_ = affinity;
return true;
}
// ================================================================================================
bool NumaNode::SchedSetAffinity() {
if (!GetAffinity()) {
return false;
}
if (!SetThreadGroupAffinity(GetCurrentThread(),
static_cast<GROUP_AFFINITY*>(affinity_), nullptr)) {
ClPrint(amd::LOG_ERROR, amd::LOG_RESOURCE,
"Failed setting numa node(%u) affinity onto thread with error %d",
node_index_, GetLastError());
return false;
}
return true;
}
} // namespace numa
} // namespace amd
#endif // _WIN32 || __CYGWIN__
@@ -39,16 +39,29 @@ set(TEST_SRC
)
if(HIP_PLATFORM MATCHES "amd")
set(TEST_SRC
${TEST_SRC}
hipPerfHostNumaAlloc.cc)
hip_add_exe_to_target(NAME perfMemoryTest
if(WIN32)
set(TEST_SRC ${TEST_SRC} hipPerfHostNumaAllocWin.cc)
hip_add_exe_to_target(NAME perfMemoryTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME perf_test)
else()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
find_package(NUMA)
if(NUMA_FOUND)
# Test code still use libnuma apis
set(TEST_SRC ${TEST_SRC} hipPerfHostNumaAlloc.cc)
hip_add_exe_to_target(NAME perfMemoryTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME perf_test
LINKER_LIBS numa)
else()
hip_add_exe_to_target(NAME perfMemoryTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME perf_test)
endif()
endif()
else()
hip_add_exe_to_target(NAME perfMemoryTest
hip_add_exe_to_target(NAME perfMemoryTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME perf_test)
endif()
@@ -63,7 +63,6 @@ static bool hipPerfBufferCopySpeed_test(int p_tests) {
unsigned int numIter;
int numDevices = 0;
HIP_CHECK(hipGetDeviceCount(&numDevices));
int test = 0;
// 1. Run all P2P for all sizes
if (numDevices >= 2) {
for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) {
@@ -136,7 +135,6 @@ static bool hipPerfBufferCopySpeed_test(int p_tests) {
CONSOLE_PRINT(
"HIPPerfBufferCopySpeed[%3d] (%10u bytes) P2P s:%-5s d:%-5s i:%4u (GB/s) perf %f",
testIdx, bufSize_, "dev0", "dev1", numIter, (float)perf);
test++;
void* temp = malloc(bufSize_ + 4096);
void* chkBuf = reinterpret_cast<void*>(temp);
HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault));
@@ -192,7 +190,6 @@ static bool hipPerfBufferCopySpeed_test(int p_tests) {
CONSOLE_PRINT(
"HIPPerfBufferCopySpeed[%3d] (%10u bytes) NoCU s:%-5s d:%-5s i:%4u (GB/s) perf %f",
testIdx, bufSize_, "dev0", "dev0", numIter, (float)perf);
test++;
void* temp = malloc(bufSize_ + 4096);
void* chkBuf = reinterpret_cast<void*>(temp);
HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault));
@@ -337,7 +334,6 @@ static bool hipPerfBufferCopySpeed_test(int p_tests) {
CONSOLE_PRINT(
"HIPPerfBufferCopySpeed[%3d] (%10u bytes) %-5s s:%-5s d:%-5s i:%4u (GB/s) perf %f",
testIdx, bufSize_, " ", strSrc, strDst, numIter, (float)perf);
test++;
void* temp = malloc(bufSize_ + 4096);
void* chkBuf = reinterpret_cast<void*>(temp);
HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault));
@@ -96,7 +96,7 @@ static bool hipPerfDevMemReadSpeed_test() {
if (hDst[0] != (nBytes / sizeof(uint))) {
DEBUG_PRINT(
"hipPerfDevMemReadSpeed - Data validation failed for warm up run! expected %lu got %u\n",
"hipPerfDevMemReadSpeed - Data validation failed for warm up run! expected %zu got %u\n",
nBytes / sizeof(uint), hDst[0]);
return false;
}
@@ -24,41 +24,23 @@ THE SOFTWARE.
* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
* Copies data between host and device.
*/
#include <unistd.h>
#include <numaif.h>
#include <numa.h>
#include <hip_test_common.hh>
// #define ENABLE_DEBUG 1
// To run it correctly, we must not export HIP_VISIBLE_DEVICES.
// And we must explicitly link libnuma because of numa api move_pages().
#define NUM_PAGES 4
#define NUM_PAGES 100
char* h = nullptr;
char* d_h = nullptr;
char* m = nullptr;
char* d_m = nullptr;
int page_size = 1024;
int page_size = 0;
const int mode[] = {MPOL_DEFAULT, MPOL_BIND, MPOL_PREFERRED, MPOL_INTERLEAVE};
const char* modeStr[] = {"MPOL_DEFAULT", "MPOL_BIND", "MPOL_PREFERRED", "MPOL_INTERLEAVE"};
std::string exeCommand(const char* cmd) {
std::array<char, 128> buff;
std::string result;
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
if (!pipe) {
return result;
}
while (fgets(buff.data(), buff.size(), pipe.get()) != nullptr) {
result += buff.data();
}
return result;
}
int getCpuAgentCount() {
const char* cmd = "cat /proc/cpuinfo | grep \"physical id\" | sort | uniq | wc -l";
int cpuAgentCount = std::atoi(exeCommand(cmd).c_str());
return cpuAgentCount;
}
bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
void* pages[NUM_PAGES];
int status[NUM_PAGES];
@@ -66,10 +48,14 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
CONSOLE_PRINT("set cpu %d, gpu %d, numaMode %d, hostMallocflags %u\n", cpuId, gpuId, numaMode,
hostMallocflags);
if (gpuId >= 0) {
HIP_CHECK(hipSetDevice(gpuId));
}
if (cpuId >= 0) {
unsigned long nodeMask = 1 << cpuId; // NOLINT
unsigned long maxNode = sizeof(nodeMask) * 8; // NOLINT
// Will override existing numa policy in memory
if (set_mempolicy(numaMode, numaMode == MPOL_DEFAULT ? NULL : &nodeMask,
numaMode == MPOL_DEFAULT ? 0 : maxNode) == -1) {
WARN("set_mempolicy() failed with err " << errno << "\n");
@@ -77,10 +63,6 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
}
}
if (gpuId >= 0) {
HIP_CHECK(hipSetDevice(gpuId));
}
posix_memalign(reinterpret_cast<void**>(&m), page_size, page_size * NUM_PAGES);
HIP_CHECK(hipHostRegister(m, page_size * NUM_PAGES, hipHostRegisterMapped));
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&d_m), m, 0));
@@ -164,8 +146,9 @@ bool runTest(const int& cpuCount, const int& gpuCount, unsigned int hostMallocfl
TEST_CASE("Perf_hipPerfHostNumaAlloc_test") {
int gpuCount = 0;
HIP_CHECK(hipGetDeviceCount(&gpuCount));
int cpuCount = getCpuAgentCount();
CONSOLE_PRINT("Cpu count %d, Gpu count %d\n", cpuCount, gpuCount);
int cpuCount = numa_max_node() + 1; // number of numa nodes
page_size = getpagesize();
CONSOLE_PRINT("Cpu count %d, Gpu count %d, page_size %d\n", cpuCount, gpuCount, page_size);
if (cpuCount < 0 || gpuCount < 0) {
SUCCEED(
@@ -0,0 +1,326 @@
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @addtogroup hipHostMalloc
* @{
* @ingroup hipHostMalloc
* `hipHostMalloc(T** ptr, size_t size, unsigned int flags)` -
* Allocate pinned host buffer.
*/
#include <hip_test_common.hh>
#include <windows.h>
#include <processtopologyapi.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <psapi.h>
#include <tchar.h>
SIZE_T allocSize = 1024 * 1024; // 1 MB
DWORD pageSize = 0;
struct NumaNodeInfo {
WORD nodeNumber;
WORD groupNumber;
KAFFINITY mask;
ULONGLONG freeBytes;
};
enum class MallocType {
hostMallocType = 0,
hiphostMallocType = 1,
};
struct ThreadPara {
ThreadPara(NumaNodeInfo* node_ = nullptr, MallocType mallocType_ = MallocType::hostMallocType,
unsigned int flags_ = 0, int deviceId_= -1) {
node = node_;
mallocType = mallocType_;
flags = flags_;
deviceId = deviceId_;
}
NumaNodeInfo *node;
MallocType mallocType;
unsigned int flags; // flags to allocate buffer
int deviceId; // for MGPUs test
};
bool checkNumaNodeInfo(const PVOID buffer, const SIZE_T bufferSize, const WORD nodeNumber,
bool pinned = false) {
DWORD_PTR startPage = ((DWORD_PTR)buffer) / pageSize;
DWORD_PTR endPage = ((DWORD_PTR)buffer + bufferSize - 1) / pageSize;
DWORD_PTR numPages = (endPage - startPage) + 1;
PCHAR startPtr = (PCHAR)(pageSize * startPage);
PPSAPI_WORKING_SET_EX_INFORMATION wsInfo = static_cast<PPSAPI_WORKING_SET_EX_INFORMATION>(
malloc(numPages * sizeof(PSAPI_WORKING_SET_EX_INFORMATION)));
if (wsInfo == NULL) {
std::cerr <<"Could not allocate array of PSAPI_WORKING_SET_EX_INFORMATION structures\n";
return false;
}
for (DWORD_PTR i = 0; i < numPages; i++) {
wsInfo[i].VirtualAddress = startPtr + i * pageSize;
}
BOOL bResult = QueryWorkingSetEx(GetCurrentProcess(), wsInfo,
(DWORD)numPages * sizeof(PSAPI_WORKING_SET_EX_INFORMATION));
if (!bResult) {
std::cerr <<"QueryWorkingSetEx failed: " << GetLastError() << "\n";
free(wsInfo);
return false;
}
bool ret = true;
for (DWORD_PTR i = 0; i < numPages; i++) {
BOOL IsValid = wsInfo[i].VirtualAttributes.Valid;
DWORD Node = wsInfo[i].VirtualAttributes.Node;
if (pinned) {
if (!IsValid ) {
std::cerr << "Page " << i << " is invalid\n";
ret = false;
break;
} else if (nodeNumber != Node) {
std::cerr << "Page " << i << " has node " << Node << " not matching expected " << nodeNumber << "\n";
ret = false;
break;
}
} else if (IsValid && nodeNumber != Node) {
// maybe IsValid = false for unpinned
std::cerr << "Page " << i << " has node " << Node << " not matching expected " << nodeNumber << "\n";
ret = false;
break;
}
}
free(wsInfo);
return ret;
}
void enumerateNumaNodes(std::vector<NumaNodeInfo> &nodes) {
DWORD len = 0;
GetLogicalProcessorInformationEx(RelationNumaNode, nullptr, &len);
std::vector<BYTE> buffer(len);
if (!GetLogicalProcessorInformationEx(RelationNumaNode,
reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()),
&len)) {
std::cerr << "GetLogicalProcessorInformationEx failed. Error: " << GetLastError() << "\n";
return;
}
BYTE* ptr = buffer.data();
while (ptr < buffer.data() + len) {
auto info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(ptr);
if (info->Relationship == RelationNumaNode) {
NUMA_NODE_RELATIONSHIP *numaRelation = &info->NumaNode;
NumaNodeInfo node{};
node.nodeNumber = static_cast<WORD>(numaRelation->NodeNumber);
if (!GetNumaAvailableMemoryNodeEx(node.nodeNumber, &node.freeBytes)) {
std::cerr << "GetNumaAvailableMemoryNodeEx(" << node.nodeNumber <<
") failed with Error: " << GetLastError() << "\n";
continue;
}
if (numaRelation->GroupCount == 0) {
// Before Windows 20H2
node.groupNumber = numaRelation->GroupMask.Group;
node.mask = numaRelation->GroupMask.Mask;
nodes.push_back(node);
} else {
// Since Windows 20H2.
GROUP_AFFINITY *groupMasks = numaRelation->GroupMasks;
for (int i = 0; i < numaRelation->GroupCount; i++) {
node.groupNumber = groupMasks[i].Group;
node.mask = groupMasks[i].Mask;
nodes.push_back(node);
}
}
}
ptr += info->Size;
}
}
static DWORD WINAPI workerThread(LPVOID lpParam) {
ThreadPara *threadPara = reinterpret_cast<ThreadPara*>(lpParam);
NumaNodeInfo* node = threadPara->node;
PROCESSOR_NUMBER procNumber{};
GetCurrentProcessorNumberEx(&procNumber);
std::cout << "Thread is running on processor number: " << static_cast<int>(procNumber.Number)
<< " in group: " << procNumber.Group << std::endl;
USHORT runNode = -1;
if (GetNumaProcessorNodeEx(&procNumber, &runNode)) {
std::cout << "Thread is running on NUMA node " << runNode << "\n";
}
else {
std::cerr << "Failed to get NUMA node. Error: " << GetLastError() << std::endl;
return -1;
}
if (static_cast<USHORT>(node->nodeNumber) != runNode) {
std::cerr << "runNode " << runNode << "not matching node->nodeNumber " <<
node->nodeNumber <<"\n";
return -1;
}
if (threadPara->deviceId >= 0) {
HIP_CHECK(hipSetDevice(threadPara->deviceId)); // doesn't matter AMD_CPU_AFFINITY is 1 or 0
}
void* pMem = nullptr;
switch (threadPara->mallocType) {
case MallocType::hostMallocType:
// Place holder
pMem = VirtualAllocExNuma(GetCurrentProcess(),
nullptr,
allocSize,
threadPara->flags,
PAGE_READWRITE,
node->nodeNumber);
break;
case MallocType::hiphostMallocType:
HIP_CHECK(hipHostMalloc(&pMem, allocSize, threadPara->flags));
break;
default:
return -1;
}
if (!pMem) {
std::cerr << "NUMA allocation failed for thread "
<< GetCurrentThreadId() << "on NUMA node " << node->nodeNumber <<
" with Error: " << GetLastError() << "\n";
return -1;
}
memset(pMem, 0xCD, allocSize);
bool ret = checkNumaNodeInfo(pMem, allocSize, node->nodeNumber,
threadPara->mallocType != MallocType::hostMallocType);
switch (threadPara->mallocType) {
case MallocType::hostMallocType:
VirtualFree(pMem, 0, MEM_RELEASE);
break;
case MallocType::hiphostMallocType:
HIP_CHECK(hipHostFree(pMem));
break;
default:
return -1;
}
return ret ? 0 : -1;
}
static void runTestPrefered(std::vector<NumaNodeInfo> &nodes, MallocType type, unsigned int flags,
const char *description) {
int gpuCount = 0;
HIP_CHECK(hipGetDeviceCount(&gpuCount));
std::cout << std::dec;
std::vector<HANDLE> threadHandles;
std::vector<ThreadPara> paras;
paras.reserve(gpuCount * nodes.size());
int index = 0;
for (int dev = 0; dev < gpuCount; dev++) {
int numaNode = -1;
HIP_CHECK(hipDeviceGetAttribute(&numaNode, hipDeviceAttributeHostNumaId, dev));
if (numaNode == -1) {
continue; // Impossible here
}
for (auto& node : nodes) {
if (numaNode != node.nodeNumber) {
continue;
}
if (node.freeBytes < allocSize) {
std::cerr << "node.freeBytes " << node.freeBytes <<" < allocSize " << allocSize << "\n";
continue;
}
// For best perf, we prefer creating a thread on the host numa node of the gpu device.
auto& ref = paras.emplace_back(&node, type, flags, dev);
HANDLE hThread = CreateThread(nullptr, 0, workerThread, &ref, CREATE_SUSPENDED, nullptr);
if (!hThread) {
std::cerr << "Thread creation failed. Error: " << GetLastError() << "\n";
continue;
}
GROUP_AFFINITY ga = {};
ga.Group = node.groupNumber;
ga.Mask = node.mask;
GROUP_AFFINITY prev = {};
if (!SetThreadGroupAffinity(hThread, &ga, &prev)) {
std::cerr << "SetThreadGroupAffinity failed. Error: " << GetLastError() << "\n";
CloseHandle(hThread);
continue;
}
std::cout << "dev " << dev << ", thread " << index++ << ": Group: " << ga.Group <<
", Mask: " << std::hex << ga.Mask << "; prev: Group: " << std::dec << prev.Group <<
", Mask: " << std::hex << prev.Mask << std::dec <<"\n";
ResumeThread(hThread);
threadHandles.push_back(hThread);
// A single NUMA node can span multiple processor groups on systems with more than 64 processors,
// so we will continue searching for next node of the same nodeNumber.
}
}
// Wait for all threads
WaitForMultipleObjects((DWORD)threadHandles.size(), threadHandles.data(), TRUE, INFINITE);
bool result = true;
for (auto h : threadHandles) {
DWORD exitCode = 0;
if (GetExitCodeThread(h, &exitCode)) {
result &= (exitCode == 0);
} else {
result = false;
}
CloseHandle(h);
}
std::cout << description << (result ? " passed\n" : " failed\n");
REQUIRE(result);
}
/* Test memory allocation on preferred host numa node on each CPU */
TEST_CASE("Perf_hipPerfHostNumaAlloc_test_preferred_host_numa_node_on_each_GPU") {
std::vector<NumaNodeInfo> nodes;
enumerateNumaNodes(nodes);
if (nodes.empty()) {
std::cerr << "No NUMA nodes found.\n";
REQUIRE(false);
}
SYSTEM_INFO systemInfo;
GetSystemInfo(&systemInfo);
pageSize = systemInfo.dwPageSize;
std::cout << "logic processor count " << systemInfo.dwNumberOfProcessors
<< ", page size " << pageSize << "\n";
int numaNode = -1;
HIP_CHECK(hipDeviceGetAttribute(&numaNode, hipDeviceAttributeHostNumaId, 0));
if (numaNode == -1) {
HipTest::HIP_SKIP_TEST("Host NUMA isn't supported hence skipping the test...\n");
return;
}
HIP_CHECK(hipSetDevice(0));
// In windows, it is the same with / without hipHostMallocNumaUser
runTestPrefered(nodes,
MallocType::hiphostMallocType, hipHostMallocDefault | hipHostMallocNumaUser,
"hiphostMalloc(hipHostMallocDefault | hipHostMallocNumaUser) on preferred numa node");
runTestPrefered(nodes,
MallocType::hiphostMallocType, hipHostAllocMapped | hipHostMallocNumaUser,
"hiphostMalloc(hipHostAllocMapped | hipHostMallocNumaUser) on preferred numa node");
runTestPrefered(nodes,
MallocType::hostMallocType, MEM_RESERVE | MEM_COMMIT,
"VirtualAllocExNuma(MEM_RESERVE | MEM_COMMIT) on preferred numa node");
}
/**
* End doxygen group hipHostMalloc.
* @}
*/
@@ -82,11 +82,8 @@ void hipPerfMemcpy::TestResult(unsigned int numTests,
perf *= 2.0;
}
CONSOLE_PRINT("hipPerfMemcpy[%d] %s copy BW %.2f GB/s for memory size of %lu Bytes.\n", numTests,
CONSOLE_PRINT("hipPerfMemcpy[%d] %s copy BW %.2f GB/s for memory size of %zu Bytes.\n", numTests,
typestr, perf, totalSizes_[numTests]);
if (totalSizes_[numTests] == 4194304 && type == hipMemcpyDeviceToDeviceNoCU)
REQUIRE(perf < NOCU_MAX_BW);
}
bool hipPerfMemcpy::run_h2d(unsigned int numTests) {
@@ -335,7 +335,7 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type,
auto sec = diff.count();
auto perf = static_cast<double>((sizeElements * NUM_ITER * (1e-09)) / sec);
CONSOLE_PRINT("hipPerf3DMemset%s[%d] (GB/s) for %5lu x %5lu x %lu bytes : %7.2f\n",
CONSOLE_PRINT("hipPerf3DMemset%s[%d] (GB/s) for %5zu x %5zu x %zu bytes : %7.2f\n",
(async ? "Async" : " "), test, bufSize_, bufSize_, depth, perf);
HIP_CHECK(hipFree(devPitchedPtr.ptr));
free(A_h);
@@ -557,6 +557,8 @@ typedef enum hipDeviceAttribute_t {
///< hipHostRegister
hipDeviceAttributeMemoryPoolSupportedHandleTypes, ///< Supported handle mask for HIP Stream
///< Ordered Memory Allocator
hipDeviceAttributeHostNumaId, ///< NUMA ID of the cpu node closest to the device,
///< or -1 when NUMA isn't supported
hipDeviceAttributeCudaCompatibleEnd = 9999,
hipDeviceAttributeAmdSpecificBegin = 10000,
@@ -2978,6 +2978,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
case hipDeviceAttributeMemoryPoolSupportedHandleTypes:
cdattr = cudaDevAttrMemoryPoolSupportedHandleTypes;
break;
case hipDeviceAttributeHostNumaId:
cdattr = cudaDevAttrHostNumaId;
break;
default:
return hipCUDAErrorTohipError(cudaErrorInvalidValue);
}