diff --git a/projects/clr/hipamd/src/hip_device_runtime.cpp b/projects/clr/hipamd/src/hip_device_runtime.cpp index c277b4d83f..adb58a63ec 100644 --- a/projects/clr/hipamd/src/hip_device_runtime.cpp +++ b/projects/clr/hipamd/src/hip_device_runtime.cpp @@ -454,6 +454,9 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) case hipDeviceAttributeMaxAvailableVgprsPerThread: *pi = static_cast(g_devices[device]->devices()[0]->info().availableVGPRs_); break; + case hipDeviceAttributeHostNumaId: + *pi = static_cast(g_devices[device]->devices()[0]->getPreferredNumaNode()); + break; default: HIP_RETURN(hipErrorInvalidValue); } diff --git a/projects/clr/rocclr/cmake/ROCclrHSA.cmake b/projects/clr/rocclr/cmake/ROCclrHSA.cmake index d28ca82e47..f7e2ada661 100644 --- a/projects/clr/rocclr/cmake/ROCclrHSA.cmake +++ b/projects/clr/rocclr/cmake/ROCclrHSA.cmake @@ -67,13 +67,6 @@ endif() #target_include_directories(rocclr PRIVATE ${AMD_HSA_INCLUDE_DIR}/..) -find_package(NUMA) -if(NUMA_FOUND) - target_compile_definitions(rocclr PUBLIC ROCCLR_SUPPORT_NUMA_POLICY) - target_include_directories(rocclr PUBLIC ${NUMA_INCLUDE_DIR}) - target_link_libraries(rocclr PUBLIC ${NUMA_LIBRARIES}) -endif() - find_package(OpenGL REQUIRED) target_sources(rocclr PRIVATE diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index a367df0e7b..4afc15b429 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -2003,7 +2003,10 @@ class Device : public RuntimeObject { virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; - virtual const uint32_t getPreferredNumaNode() const { return 0; } + virtual uint32_t getPreferredNumaNode() const { + return static_cast(-1); //!< PAL doesn't support it + } + virtual void ReleaseGlobalSignal(void* signal) const {} virtual const bool isFineGrainSupported() const { return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false; diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 2a21335698..4270b4b796 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -54,10 +54,6 @@ #include #include #include -#ifdef ROCCLR_SUPPORT_NUMA_POLICY -#include -#include -#endif // ROCCLR_SUPPORT_NUMA_POLICY #include #include @@ -2039,30 +2035,17 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg, // ================================================================================================ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { void* ptr = nullptr; -#ifndef ROCCLR_SUPPORT_NUMA_POLICY - ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_); -#else - int mode = MPOL_DEFAULT; - int maxNodes = numa_num_possible_nodes(); - bitmask* nodeMask = numa_bitmask_alloc(maxNodes); - auto cpuCount = cpu_agents_.size(); - - long res = get_mempolicy(&mode, nodeMask->maskp, nodeMask->size, NULL, 0); - if (res) { - LogPrintfError("get_mempolicy failed with error %ld", res); - return ptr; + auto numa_node_count = cpu_agents_.size(); // count of host numa nodes + numa::NumaPolicy np(numa_node_count); + if (!np.GetMemPolicy()) { + return hostAlloc(size, alignment, mem_seg, cpu_agent_info_); } - ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_RESOURCE, - "get_mempolicy() succeed with mode %d, nodeMask 0x%lx, cpuCount %zu", mode, - *nodeMask->maskp, cpuCount); - - switch (mode) { - // For details, see "man get_mempolicy". - case MPOL_BIND: - case MPOL_PREFERRED: + switch (np.GetPolicy()) { + case numa::NumaPolicy::Policy::kPrefered: + case numa::NumaPolicy::Policy::kBind: // We only care about the first CPU node - for (unsigned int i = 0; i < cpuCount; i++) { - if ((1u << i) & *nodeMask->maskp) { + for (unsigned int i = 0; i < numa_node_count; i++) { + if (np.IsPolicySetAt(i)) { ptr = hostAlloc(size, alignment, mem_seg, &cpu_agents_[i]); break; } @@ -2072,8 +2055,6 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg // All other modes fall back to default mode ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_); } - numa_free_cpumask(nodeMask); -#endif // ROCCLR_SUPPORT_NUMA_POLICY return ptr; } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 02559d49a1..e233524a96 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -562,7 +562,7 @@ class Device : public NullDevice { virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0); - const uint32_t getPreferredNumaNode() const { return preferred_numa_node_; } + virtual uint32_t getPreferredNumaNode() const final { return preferred_numa_node_; } const bool isFineGrainSupported() const; diff --git a/projects/clr/rocclr/os/os.hpp b/projects/clr/rocclr/os/os.hpp index a65d454fdf..3f5ccf07e3 100644 --- a/projects/clr/rocclr/os/os.hpp +++ b/projects/clr/rocclr/os/os.hpp @@ -23,6 +23,7 @@ #include "top.hpp" #include "utils/util.hpp" +#include "utils/flags.hpp" #include #include @@ -232,7 +233,7 @@ class Os : AllStatic { static void alignedFree(void* mem); //! NUMA related settings - static void setPreferredNumaNode(uint32_t node); + inline static void setPreferredNumaNode(uint32_t node); // File/Path helper routines: // @@ -526,6 +527,69 @@ inline uint Os::ThreadAffinityMask::getNextSet(uint cpu) const { #endif +/* Mini numa interface instead of numa lib apis */ +namespace numa { + +static constexpr uint32_t kBitsPerUInt64 = 8 * sizeof(uint64_t); + +/*! \brief Manage Numa policy. + * + * \note Works in Linux only, dummy in Windows. + */ +class NumaPolicy final { +public: + enum class Policy { + kDefault = 0, + kPrefered = 1, + kBind = 2, + kInterleave = 3, + kLocal = 4, + kPreferedMany = 5, + kWeightedInterleave = 6, + kMax = 7 + }; + NumaPolicy(uint32_t numa_node_count); + + //! Query memory policy and node bitmask from Linux kernel + bool GetMemPolicy(); + + //! Check whether node_index is in bitmask for kPrefered and kBind modes + bool IsPolicySetAt(uint32_t node_index) const; + + //! Return the queried policy + Policy GetPolicy() const { return policy_; } +private: + std::vector node_map_{}; //!< Node bitmask for kPrefered and kBind modes + Policy policy_{Policy::kDefault}; //!< The policy +}; + +/*! \brief Manage Numa node. + * + * \note Works in Linux and Windows. + */ +class NumaNode final { +public: + NumaNode (uint32_t node_index): node_index_(node_index) {} + ~NumaNode(); + //! Apply the CPU affinity mask of the node onto the current thread + bool SchedSetAffinity(); +private: + uint32_t node_index_; //! Index of the Numa node + void* affinity_ = nullptr; //!< Affinity mask of logical CPUs on this node + uint32_t size_ = 0; //!< Number of valid bits + //! Guery the affinity mask of logical CPUs on this node + bool GetAffinity(); +}; + +} // namespace numa + +inline void Os::setPreferredNumaNode(uint32_t node) { + if (AMD_CPU_AFFINITY) { + numa::NumaNode numaNode(node); + numaNode.SchedSetAffinity(); + } +} + } // namespace amd #endif /*OS_HPP_*/ diff --git a/projects/clr/rocclr/os/os_posix.cpp b/projects/clr/rocclr/os/os_posix.cpp index feb22ae931..a3766a71fd 100644 --- a/projects/clr/rocclr/os/os_posix.cpp +++ b/projects/clr/rocclr/os/os_posix.cpp @@ -19,11 +19,10 @@ THE SOFTWARE. */ #if !defined(_WIN32) && !defined(__CYGWIN__) - +#include +#include #include "os/os.hpp" #include "thread/thread.hpp" -#include "utils/util.hpp" -#include "utils/flags.hpp" #include #include @@ -49,11 +48,6 @@ #ifndef DT_GNU_HASH #define DT_GNU_HASH 0x6ffffef5 #endif // DT_GNU_HASH - -#ifdef ROCCLR_SUPPORT_NUMA_POLICY -#include -#endif // ROCCLR_SUPPORT_NUMA_POLICY - #include #include #include @@ -327,20 +321,6 @@ void Os::currentStackInfo(address* base, size_t* size) { void Os::setCurrentThreadName(const char* name) { ::prctl(PR_SET_NAME, name); } -void Os::setPreferredNumaNode(uint32_t node) { -#ifdef ROCCLR_SUPPORT_NUMA_POLICY - if (AMD_CPU_AFFINITY && (numa_available() >= 0)) { - bitmask* bm = numa_allocate_cpumask(); - numa_node_to_cpus(node, bm); - if (numa_sched_setaffinity(0, bm) < 0) { - assert(0 && "failed to set affinity"); - } - - numa_free_cpumask(bm); - } -#endif // ROCCLR_SUPPORT_NUMA_POLICY -} - void* Thread::entry(Thread* thread) { sigset_t set; @@ -979,6 +959,106 @@ void Os::CxaDemangle(const std::string& name, std::string* result) { free(demangled); } +namespace numa { + +// ================================================================================================ +NumaPolicy::NumaPolicy(const uint32_t numa_node_count) : + node_map_((numa_node_count + kBitsPerUInt64 - 1) / kBitsPerUInt64, 0) { } + +// ================================================================================================ +bool NumaPolicy::GetMemPolicy() { + int policy = 0; + if (syscall(__NR_get_mempolicy, &policy, node_map_.data(), + node_map_.size() * kBitsPerUInt64, nullptr, 0) < 0) { + ClPrint(amd::LOG_DEBUG, amd::LOG_RESOURCE, + "syscall(__NR_get_mempolicy, size=%zu) failed to query policy", + node_map_.size() * kBitsPerUInt64); + return false; + } + if (policy < static_cast(Policy::kDefault) || policy > static_cast(Policy::kMax)) { + ClPrint(amd::LOG_DEBUG, amd::LOG_RESOURCE, + "syscall(__NR_get_mempolicy) returned wrong policy %d", policy); + return false; + } + policy_ = static_cast(policy); + return true; +} + +// ================================================================================================ +bool NumaPolicy::IsPolicySetAt(uint32_t node_index) const { + const uint32_t i = node_index / kBitsPerUInt64; + if (i < node_map_.size()) { + return ((node_map_[i] >> (node_index % kBitsPerUInt64)) & 1) ? + true: false; + } else { + return false; + } +} + +// ================================================================================================ +NumaNode::~NumaNode() { + if (affinity_) { + delete static_cast *>(affinity_); + affinity_ = nullptr; + } +} + +// ================================================================================================ +bool NumaNode::GetAffinity() { + const std::string path = "/sys/devices/system/node/node" + std::to_string(node_index_) + + "/cpumap"; + std::ifstream file(path); + if (!file) { + std::cerr << "Failed to open " << path << "\n"; + ClPrint(amd::LOG_DEBUG, amd::LOG_RESOURCE, "%s cannot be opened", path); + return false; + } + std::string line; + std::getline(file, line); + file.close(); + + // To remove commas and whitespace + line.erase(std::remove_if(line.begin(), line.end(), + [](unsigned char x) { return std::isspace(x) || x == ','; }), line.end()); + + constexpr uint32_t kHexsPerUInt64 = 2 * sizeof(uint64_t); + auto affinity = new std::vector((line.size() + kHexsPerUInt64 - 1) / kHexsPerUInt64); + auto iter = affinity->begin(); + // To parse from the end (little-endian layout) + for (int i = line.size(); i > 0; i -= kHexsPerUInt64) { + uint32_t start = (i >= kHexsPerUInt64) ? i - kHexsPerUInt64 : 0; + uint32_t len = (i >= kHexsPerUInt64) ? kHexsPerUInt64 : i; + + const std::string chunk = line.substr(start, len); + const uint64_t value = std::stoul(chunk, nullptr, 16); + *(iter++) = value; + if (len == kHexsPerUInt64) { + size_ += kBitsPerUInt64; + } else { + // Last one + size_ = kBitsPerUInt64 - __builtin_clzl(value); + } + } + affinity_ = affinity; + return true; +} + +// ================================================================================================ +bool NumaNode::SchedSetAffinity() { + if (!GetAffinity()) { + return false; + } + if (syscall(__NR_sched_setaffinity, 0, size_, + static_cast*>(affinity_)->data()) < 0) { + ClPrint(amd::LOG_DEBUG, amd::LOG_RESOURCE, + "syscall(__NR_sched_setaffinity, size=%u) failed", size_); + return false; + } + return true; +} + +} // namespace numa + } // namespace amd #endif // !defined(_WIN32) && !defined(__CYGWIN__) diff --git a/projects/clr/rocclr/os/os_win32.cpp b/projects/clr/rocclr/os/os_win32.cpp index 67d55a21c6..c80c1e4d28 100644 --- a/projects/clr/rocclr/os/os_win32.cpp +++ b/projects/clr/rocclr/os/os_win32.cpp @@ -22,7 +22,6 @@ #include "os/os.hpp" #include "thread/thread.hpp" -#include "utils/flags.hpp" #include #include #include @@ -40,9 +39,6 @@ #define WINAPI #endif - -BOOL(WINAPI* pfnGetNumaNodeProcessorMaskEx)(USHORT, PGROUP_AFFINITY) = NULL; - namespace amd { static size_t allocationGranularity_; @@ -54,10 +50,7 @@ PVOID divExceptionHandler = NULL; #endif // _WIN64 static double PerformanceFrequency; - -typedef BOOL(WINAPI* SetThreadGroupAffinity_fn)(__in HANDLE, __in CONST GROUP_AFFINITY*, - __out_opt PGROUP_AFFINITY); -static SetThreadGroupAffinity_fn pfnSetThreadGroupAffinity = NULL; +static GROUP_AFFINITY nativeMask_; #pragma section(".CRT$XCU", long, read) __declspec(allocate(".CRT$XCU")) bool (*__init)(void) = Os::init; @@ -81,12 +74,9 @@ bool Os::init() { QueryPerformanceFrequency(&frequency); PerformanceFrequency = (double)frequency.QuadPart; - HMODULE handle = ::LoadLibrary("kernel32.dll"); - if (handle != NULL) { - pfnSetThreadGroupAffinity = - (SetThreadGroupAffinity_fn)::GetProcAddress(handle, "SetThreadGroupAffinity"); - pfnGetNumaNodeProcessorMaskEx = (BOOL(WINAPI*)(USHORT, PGROUP_AFFINITY))::GetProcAddress( - handle, "GetNumaNodeProcessorMaskEx"); + if (!GetThreadGroupAffinity(GetCurrentThread(), &nativeMask_)) { + ClPrint(amd::LOG_ERROR, amd::LOG_INIT, "Failed getting main thread affinity with error %d", + GetLastError()); } return Thread::init(); @@ -250,8 +240,6 @@ static void SetThreadName(DWORD threadId, const char* name) { void Os::setCurrentThreadName(const char* name) { SetThreadName(GetCurrentThreadId(), name); } -void Os::setPreferredNumaNode(uint32_t node) {}; - static LONG WINAPI divExceptionFilter(struct _EXCEPTION_POINTERS* ep) { DWORD code = ep->ExceptionRecord->ExceptionCode; @@ -325,25 +313,33 @@ const void* Os::createOsThread(Thread* thread) { return reinterpret_cast(handle); } +// This function only works with CPU core number <= 64. +// SetThreadGroupAffinity does clear the thread's affinity to other processor groups. +// No API yet to set multi-group affinity. +// So only the last group will take affect in this function! void Os::setThreadAffinity(const void* handle, const Os::ThreadAffinityMask& mask) { - if (pfnSetThreadGroupAffinity != NULL) { - GROUP_AFFINITY group = {0}; - for (WORD i = 0; i < sizeof(mask.mask_) / sizeof(KAFFINITY); ++i) { - group.Mask = mask.mask_[i]; - group.Group = i; - if (group.Mask != 0) { - pfnSetThreadGroupAffinity((HANDLE)handle, &group, NULL); - } - } - } else { // pfnSetThreadGroupAffinity == NULL - DWORD_PTR threadAffinityMask = (DWORD_PTR)mask.mask_[0]; - if (threadAffinityMask != 0) { - ::SetThreadAffinityMask((HANDLE)handle, threadAffinityMask); + GROUP_AFFINITY group = {0}; + for (WORD i = 0; i < sizeof(mask.mask_) / sizeof(KAFFINITY); ++i) { + group.Mask = mask.mask_[i]; + group.Group = i; + if (group.Mask != 0) { + SetThreadGroupAffinity((HANDLE)handle, &group, NULL); } } } -bool Os::setThreadAffinityToMainThread() { return true; } +bool Os::setThreadAffinityToMainThread() { + if (AMD_CPU_AFFINITY) { + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Setting Affinity to the main thread's affinity"); + if (!SetThreadGroupAffinity(GetCurrentThread(), &nativeMask_, nullptr)) { + ClPrint(amd::LOG_ERROR, amd::LOG_INIT, "Failed setting main thread affinity with error %d", + GetLastError()); + return false; + } + } + return true; +} + void Os::yield() { ::SwitchToThread(); } uint64_t Os::timeNanos() { @@ -751,6 +747,63 @@ bool Os::DumpCoreFile() { return false; } // ================================================================================================ void Os::CxaDemangle(const std::string& name, std::string* result) { *result = name; } +namespace numa { + +// ================================================================================================ +NumaPolicy::NumaPolicy(const uint32_t numa_node_count) { +} + +// ================================================================================================ +bool NumaPolicy::GetMemPolicy() { + // Dummy as Windows doesn't support numa policy + return false; +} + +// ================================================================================================ +bool NumaPolicy::IsPolicySetAt(uint32_t node_index) const { + // Dummy as Windows doesn't support numa policy + return false; +} + +// ================================================================================================ +NumaNode::~NumaNode() { + if (affinity_) { + delete static_cast(affinity_); + affinity_ = nullptr; + } +} + +// ================================================================================================ +bool NumaNode::GetAffinity() { + GROUP_AFFINITY *affinity = new GROUP_AFFINITY(); + if (!GetNumaNodeProcessorMaskEx(node_index_, affinity)) { + ClPrint(amd::LOG_ERROR, amd::LOG_RESOURCE, + "Failed getting numa node(%u) affinity with error %d", + node_index_, GetLastError()); + delete affinity; + return false; + } + affinity_ = affinity; + return true; +} + +// ================================================================================================ +bool NumaNode::SchedSetAffinity() { + if (!GetAffinity()) { + return false; + } + if (!SetThreadGroupAffinity(GetCurrentThread(), + static_cast(affinity_), nullptr)) { + ClPrint(amd::LOG_ERROR, amd::LOG_RESOURCE, + "Failed setting numa node(%u) affinity onto thread with error %d", + node_index_, GetLastError()); + return false; + } + return true; +} + +} // namespace numa + } // namespace amd #endif // _WIN32 || __CYGWIN__ diff --git a/projects/hip-tests/catch/perftests/memory/CMakeLists.txt b/projects/hip-tests/catch/perftests/memory/CMakeLists.txt index 66cdb74350..efd30b458a 100644 --- a/projects/hip-tests/catch/perftests/memory/CMakeLists.txt +++ b/projects/hip-tests/catch/perftests/memory/CMakeLists.txt @@ -39,16 +39,29 @@ set(TEST_SRC ) if(HIP_PLATFORM MATCHES "amd") - set(TEST_SRC - ${TEST_SRC} - hipPerfHostNumaAlloc.cc) - -hip_add_exe_to_target(NAME perfMemoryTest + if(WIN32) + set(TEST_SRC ${TEST_SRC} hipPerfHostNumaAllocWin.cc) + hip_add_exe_to_target(NAME perfMemoryTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME perf_test) + else() + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") + find_package(NUMA) + if(NUMA_FOUND) + # Test code still use libnuma apis + set(TEST_SRC ${TEST_SRC} hipPerfHostNumaAlloc.cc) + hip_add_exe_to_target(NAME perfMemoryTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME perf_test LINKER_LIBS numa) + else() + hip_add_exe_to_target(NAME perfMemoryTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME perf_test) + endif() + endif() else() -hip_add_exe_to_target(NAME perfMemoryTest + hip_add_exe_to_target(NAME perfMemoryTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME perf_test) endif() diff --git a/projects/clr/rocclr/cmake/FindNUMA.cmake b/projects/hip-tests/catch/perftests/memory/FindNUMA.cmake similarity index 100% rename from projects/clr/rocclr/cmake/FindNUMA.cmake rename to projects/hip-tests/catch/perftests/memory/FindNUMA.cmake diff --git a/projects/hip-tests/catch/perftests/memory/hipPerfBufferCopySpeed.cc b/projects/hip-tests/catch/perftests/memory/hipPerfBufferCopySpeed.cc index 105c87f14a..7a7751f711 100644 --- a/projects/hip-tests/catch/perftests/memory/hipPerfBufferCopySpeed.cc +++ b/projects/hip-tests/catch/perftests/memory/hipPerfBufferCopySpeed.cc @@ -63,7 +63,6 @@ static bool hipPerfBufferCopySpeed_test(int p_tests) { unsigned int numIter; int numDevices = 0; HIP_CHECK(hipGetDeviceCount(&numDevices)); - int test = 0; // 1. Run all P2P for all sizes if (numDevices >= 2) { for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) { @@ -136,7 +135,6 @@ static bool hipPerfBufferCopySpeed_test(int p_tests) { CONSOLE_PRINT( "HIPPerfBufferCopySpeed[%3d] (%10u bytes) P2P s:%-5s d:%-5s i:%4u (GB/s) perf %f", testIdx, bufSize_, "dev0", "dev1", numIter, (float)perf); - test++; void* temp = malloc(bufSize_ + 4096); void* chkBuf = reinterpret_cast(temp); HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault)); @@ -192,7 +190,6 @@ static bool hipPerfBufferCopySpeed_test(int p_tests) { CONSOLE_PRINT( "HIPPerfBufferCopySpeed[%3d] (%10u bytes) NoCU s:%-5s d:%-5s i:%4u (GB/s) perf %f", testIdx, bufSize_, "dev0", "dev0", numIter, (float)perf); - test++; void* temp = malloc(bufSize_ + 4096); void* chkBuf = reinterpret_cast(temp); HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault)); @@ -337,7 +334,6 @@ static bool hipPerfBufferCopySpeed_test(int p_tests) { CONSOLE_PRINT( "HIPPerfBufferCopySpeed[%3d] (%10u bytes) %-5s s:%-5s d:%-5s i:%4u (GB/s) perf %f", testIdx, bufSize_, " ", strSrc, strDst, numIter, (float)perf); - test++; void* temp = malloc(bufSize_ + 4096); void* chkBuf = reinterpret_cast(temp); HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault)); diff --git a/projects/hip-tests/catch/perftests/memory/hipPerfDevMemReadSpeed.cc b/projects/hip-tests/catch/perftests/memory/hipPerfDevMemReadSpeed.cc index 887a75df16..19663b46ad 100644 --- a/projects/hip-tests/catch/perftests/memory/hipPerfDevMemReadSpeed.cc +++ b/projects/hip-tests/catch/perftests/memory/hipPerfDevMemReadSpeed.cc @@ -96,7 +96,7 @@ static bool hipPerfDevMemReadSpeed_test() { if (hDst[0] != (nBytes / sizeof(uint))) { DEBUG_PRINT( - "hipPerfDevMemReadSpeed - Data validation failed for warm up run! expected %lu got %u\n", + "hipPerfDevMemReadSpeed - Data validation failed for warm up run! expected %zu got %u\n", nBytes / sizeof(uint), hDst[0]); return false; } diff --git a/projects/hip-tests/catch/perftests/memory/hipPerfHostNumaAlloc.cc b/projects/hip-tests/catch/perftests/memory/hipPerfHostNumaAlloc.cc index 8bfa451fb4..1a7a1fdb2f 100644 --- a/projects/hip-tests/catch/perftests/memory/hipPerfHostNumaAlloc.cc +++ b/projects/hip-tests/catch/perftests/memory/hipPerfHostNumaAlloc.cc @@ -24,41 +24,23 @@ THE SOFTWARE. * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - * Copies data between host and device. */ - +#include #include +#include #include // #define ENABLE_DEBUG 1 // To run it correctly, we must not export HIP_VISIBLE_DEVICES. // And we must explicitly link libnuma because of numa api move_pages(). -#define NUM_PAGES 4 +#define NUM_PAGES 100 char* h = nullptr; char* d_h = nullptr; char* m = nullptr; char* d_m = nullptr; -int page_size = 1024; +int page_size = 0; const int mode[] = {MPOL_DEFAULT, MPOL_BIND, MPOL_PREFERRED, MPOL_INTERLEAVE}; const char* modeStr[] = {"MPOL_DEFAULT", "MPOL_BIND", "MPOL_PREFERRED", "MPOL_INTERLEAVE"}; -std::string exeCommand(const char* cmd) { - std::array buff; - std::string result; - std::unique_ptr pipe(popen(cmd, "r"), pclose); - if (!pipe) { - return result; - } - while (fgets(buff.data(), buff.size(), pipe.get()) != nullptr) { - result += buff.data(); - } - return result; -} - -int getCpuAgentCount() { - const char* cmd = "cat /proc/cpuinfo | grep \"physical id\" | sort | uniq | wc -l"; - int cpuAgentCount = std::atoi(exeCommand(cmd).c_str()); - return cpuAgentCount; -} - bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { void* pages[NUM_PAGES]; int status[NUM_PAGES]; @@ -66,10 +48,14 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { CONSOLE_PRINT("set cpu %d, gpu %d, numaMode %d, hostMallocflags %u\n", cpuId, gpuId, numaMode, hostMallocflags); + if (gpuId >= 0) { + HIP_CHECK(hipSetDevice(gpuId)); + } if (cpuId >= 0) { unsigned long nodeMask = 1 << cpuId; // NOLINT unsigned long maxNode = sizeof(nodeMask) * 8; // NOLINT + // Will override existing numa policy in memory if (set_mempolicy(numaMode, numaMode == MPOL_DEFAULT ? NULL : &nodeMask, numaMode == MPOL_DEFAULT ? 0 : maxNode) == -1) { WARN("set_mempolicy() failed with err " << errno << "\n"); @@ -77,10 +63,6 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { } } - if (gpuId >= 0) { - HIP_CHECK(hipSetDevice(gpuId)); - } - posix_memalign(reinterpret_cast(&m), page_size, page_size * NUM_PAGES); HIP_CHECK(hipHostRegister(m, page_size * NUM_PAGES, hipHostRegisterMapped)); HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&d_m), m, 0)); @@ -164,8 +146,9 @@ bool runTest(const int& cpuCount, const int& gpuCount, unsigned int hostMallocfl TEST_CASE("Perf_hipPerfHostNumaAlloc_test") { int gpuCount = 0; HIP_CHECK(hipGetDeviceCount(&gpuCount)); - int cpuCount = getCpuAgentCount(); - CONSOLE_PRINT("Cpu count %d, Gpu count %d\n", cpuCount, gpuCount); + int cpuCount = numa_max_node() + 1; // number of numa nodes + page_size = getpagesize(); + CONSOLE_PRINT("Cpu count %d, Gpu count %d, page_size %d\n", cpuCount, gpuCount, page_size); if (cpuCount < 0 || gpuCount < 0) { SUCCEED( diff --git a/projects/hip-tests/catch/perftests/memory/hipPerfHostNumaAllocWin.cc b/projects/hip-tests/catch/perftests/memory/hipPerfHostNumaAllocWin.cc new file mode 100644 index 0000000000..c283542391 --- /dev/null +++ b/projects/hip-tests/catch/perftests/memory/hipPerfHostNumaAllocWin.cc @@ -0,0 +1,326 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @addtogroup hipHostMalloc + * @{ + * @ingroup hipHostMalloc + * `hipHostMalloc(T** ptr, size_t size, unsigned int flags)` - + * Allocate pinned host buffer. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +SIZE_T allocSize = 1024 * 1024; // 1 MB +DWORD pageSize = 0; +struct NumaNodeInfo { + WORD nodeNumber; + WORD groupNumber; + KAFFINITY mask; + ULONGLONG freeBytes; +}; + +enum class MallocType { + hostMallocType = 0, + hiphostMallocType = 1, +}; + +struct ThreadPara { + ThreadPara(NumaNodeInfo* node_ = nullptr, MallocType mallocType_ = MallocType::hostMallocType, + unsigned int flags_ = 0, int deviceId_= -1) { + node = node_; + mallocType = mallocType_; + flags = flags_; + deviceId = deviceId_; + } + NumaNodeInfo *node; + MallocType mallocType; + unsigned int flags; // flags to allocate buffer + int deviceId; // for MGPUs test +}; + +bool checkNumaNodeInfo(const PVOID buffer, const SIZE_T bufferSize, const WORD nodeNumber, + bool pinned = false) { + DWORD_PTR startPage = ((DWORD_PTR)buffer) / pageSize; + DWORD_PTR endPage = ((DWORD_PTR)buffer + bufferSize - 1) / pageSize; + DWORD_PTR numPages = (endPage - startPage) + 1; + PCHAR startPtr = (PCHAR)(pageSize * startPage); + PPSAPI_WORKING_SET_EX_INFORMATION wsInfo = static_cast( + malloc(numPages * sizeof(PSAPI_WORKING_SET_EX_INFORMATION))); + + if (wsInfo == NULL) { + std::cerr <<"Could not allocate array of PSAPI_WORKING_SET_EX_INFORMATION structures\n"; + return false; + } + + for (DWORD_PTR i = 0; i < numPages; i++) { + wsInfo[i].VirtualAddress = startPtr + i * pageSize; + } + + BOOL bResult = QueryWorkingSetEx(GetCurrentProcess(), wsInfo, + (DWORD)numPages * sizeof(PSAPI_WORKING_SET_EX_INFORMATION)); + + if (!bResult) { + std::cerr <<"QueryWorkingSetEx failed: " << GetLastError() << "\n"; + free(wsInfo); + return false; + } + bool ret = true; + for (DWORD_PTR i = 0; i < numPages; i++) { + BOOL IsValid = wsInfo[i].VirtualAttributes.Valid; + DWORD Node = wsInfo[i].VirtualAttributes.Node; + if (pinned) { + if (!IsValid ) { + std::cerr << "Page " << i << " is invalid\n"; + ret = false; + break; + } else if (nodeNumber != Node) { + std::cerr << "Page " << i << " has node " << Node << " not matching expected " << nodeNumber << "\n"; + ret = false; + break; + } + } else if (IsValid && nodeNumber != Node) { + // maybe IsValid = false for unpinned + std::cerr << "Page " << i << " has node " << Node << " not matching expected " << nodeNumber << "\n"; + ret = false; + break; + } + } + free(wsInfo); + return ret; +} + +void enumerateNumaNodes(std::vector &nodes) { + DWORD len = 0; + GetLogicalProcessorInformationEx(RelationNumaNode, nullptr, &len); + std::vector buffer(len); + if (!GetLogicalProcessorInformationEx(RelationNumaNode, + reinterpret_cast(buffer.data()), + &len)) { + std::cerr << "GetLogicalProcessorInformationEx failed. Error: " << GetLastError() << "\n"; + return; + } + + BYTE* ptr = buffer.data(); + while (ptr < buffer.data() + len) { + auto info = reinterpret_cast(ptr); + if (info->Relationship == RelationNumaNode) { + NUMA_NODE_RELATIONSHIP *numaRelation = &info->NumaNode; + NumaNodeInfo node{}; + node.nodeNumber = static_cast(numaRelation->NodeNumber); + if (!GetNumaAvailableMemoryNodeEx(node.nodeNumber, &node.freeBytes)) { + std::cerr << "GetNumaAvailableMemoryNodeEx(" << node.nodeNumber << + ") failed with Error: " << GetLastError() << "\n"; + continue; + } + if (numaRelation->GroupCount == 0) { + // Before Windows 20H2 + node.groupNumber = numaRelation->GroupMask.Group; + node.mask = numaRelation->GroupMask.Mask; + nodes.push_back(node); + } else { + // Since Windows 20H2. + GROUP_AFFINITY *groupMasks = numaRelation->GroupMasks; + for (int i = 0; i < numaRelation->GroupCount; i++) { + node.groupNumber = groupMasks[i].Group; + node.mask = groupMasks[i].Mask; + nodes.push_back(node); + } + } + } + ptr += info->Size; + } +} + +static DWORD WINAPI workerThread(LPVOID lpParam) { + ThreadPara *threadPara = reinterpret_cast(lpParam); + NumaNodeInfo* node = threadPara->node; + + PROCESSOR_NUMBER procNumber{}; + GetCurrentProcessorNumberEx(&procNumber); + std::cout << "Thread is running on processor number: " << static_cast(procNumber.Number) + << " in group: " << procNumber.Group << std::endl; + + USHORT runNode = -1; + if (GetNumaProcessorNodeEx(&procNumber, &runNode)) { + std::cout << "Thread is running on NUMA node " << runNode << "\n"; + } + else { + std::cerr << "Failed to get NUMA node. Error: " << GetLastError() << std::endl; + return -1; + } + + if (static_cast(node->nodeNumber) != runNode) { + std::cerr << "runNode " << runNode << "not matching node->nodeNumber " << + node->nodeNumber <<"\n"; + return -1; + } + + if (threadPara->deviceId >= 0) { + HIP_CHECK(hipSetDevice(threadPara->deviceId)); // doesn't matter AMD_CPU_AFFINITY is 1 or 0 + } + void* pMem = nullptr; + switch (threadPara->mallocType) { + case MallocType::hostMallocType: + // Place holder + pMem = VirtualAllocExNuma(GetCurrentProcess(), + nullptr, + allocSize, + threadPara->flags, + PAGE_READWRITE, + node->nodeNumber); + break; + case MallocType::hiphostMallocType: + HIP_CHECK(hipHostMalloc(&pMem, allocSize, threadPara->flags)); + break; + default: + return -1; + } + + if (!pMem) { + std::cerr << "NUMA allocation failed for thread " + << GetCurrentThreadId() << "on NUMA node " << node->nodeNumber << + " with Error: " << GetLastError() << "\n"; + return -1; + } + + memset(pMem, 0xCD, allocSize); + bool ret = checkNumaNodeInfo(pMem, allocSize, node->nodeNumber, + threadPara->mallocType != MallocType::hostMallocType); + + switch (threadPara->mallocType) { + case MallocType::hostMallocType: + VirtualFree(pMem, 0, MEM_RELEASE); + break; + case MallocType::hiphostMallocType: + HIP_CHECK(hipHostFree(pMem)); + break; + default: + return -1; + } + return ret ? 0 : -1; +} + +static void runTestPrefered(std::vector &nodes, MallocType type, unsigned int flags, + const char *description) { + int gpuCount = 0; + HIP_CHECK(hipGetDeviceCount(&gpuCount)); + std::cout << std::dec; + std::vector threadHandles; + std::vector paras; + paras.reserve(gpuCount * nodes.size()); + int index = 0; + for (int dev = 0; dev < gpuCount; dev++) { + int numaNode = -1; + HIP_CHECK(hipDeviceGetAttribute(&numaNode, hipDeviceAttributeHostNumaId, dev)); + if (numaNode == -1) { + continue; // Impossible here + } + for (auto& node : nodes) { + if (numaNode != node.nodeNumber) { + continue; + } + if (node.freeBytes < allocSize) { + std::cerr << "node.freeBytes " << node.freeBytes <<" < allocSize " << allocSize << "\n"; + continue; + } + // For best perf, we prefer creating a thread on the host numa node of the gpu device. + auto& ref = paras.emplace_back(&node, type, flags, dev); + HANDLE hThread = CreateThread(nullptr, 0, workerThread, &ref, CREATE_SUSPENDED, nullptr); + if (!hThread) { + std::cerr << "Thread creation failed. Error: " << GetLastError() << "\n"; + continue; + } + GROUP_AFFINITY ga = {}; + ga.Group = node.groupNumber; + ga.Mask = node.mask; + GROUP_AFFINITY prev = {}; + if (!SetThreadGroupAffinity(hThread, &ga, &prev)) { + std::cerr << "SetThreadGroupAffinity failed. Error: " << GetLastError() << "\n"; + CloseHandle(hThread); + continue; + } + std::cout << "dev " << dev << ", thread " << index++ << ": Group: " << ga.Group << + ", Mask: " << std::hex << ga.Mask << "; prev: Group: " << std::dec << prev.Group << + ", Mask: " << std::hex << prev.Mask << std::dec <<"\n"; + ResumeThread(hThread); + threadHandles.push_back(hThread); + // A single NUMA node can span multiple processor groups on systems with more than 64 processors, + // so we will continue searching for next node of the same nodeNumber. + } + } + + // Wait for all threads + WaitForMultipleObjects((DWORD)threadHandles.size(), threadHandles.data(), TRUE, INFINITE); + bool result = true; + for (auto h : threadHandles) { + DWORD exitCode = 0; + if (GetExitCodeThread(h, &exitCode)) { + result &= (exitCode == 0); + } else { + result = false; + } + CloseHandle(h); + } + + std::cout << description << (result ? " passed\n" : " failed\n"); + REQUIRE(result); +} + +/* Test memory allocation on preferred host numa node on each CPU */ +TEST_CASE("Perf_hipPerfHostNumaAlloc_test_preferred_host_numa_node_on_each_GPU") { + std::vector nodes; + enumerateNumaNodes(nodes); + if (nodes.empty()) { + std::cerr << "No NUMA nodes found.\n"; + REQUIRE(false); + } + SYSTEM_INFO systemInfo; + GetSystemInfo(&systemInfo); + pageSize = systemInfo.dwPageSize; + std::cout << "logic processor count " << systemInfo.dwNumberOfProcessors + << ", page size " << pageSize << "\n"; + int numaNode = -1; + HIP_CHECK(hipDeviceGetAttribute(&numaNode, hipDeviceAttributeHostNumaId, 0)); + if (numaNode == -1) { + HipTest::HIP_SKIP_TEST("Host NUMA isn't supported hence skipping the test...\n"); + return; + } + HIP_CHECK(hipSetDevice(0)); + // In windows, it is the same with / without hipHostMallocNumaUser + runTestPrefered(nodes, + MallocType::hiphostMallocType, hipHostMallocDefault | hipHostMallocNumaUser, + "hiphostMalloc(hipHostMallocDefault | hipHostMallocNumaUser) on preferred numa node"); + runTestPrefered(nodes, + MallocType::hiphostMallocType, hipHostAllocMapped | hipHostMallocNumaUser, + "hiphostMalloc(hipHostAllocMapped | hipHostMallocNumaUser) on preferred numa node"); + runTestPrefered(nodes, + MallocType::hostMallocType, MEM_RESERVE | MEM_COMMIT, + "VirtualAllocExNuma(MEM_RESERVE | MEM_COMMIT) on preferred numa node"); +} +/** + * End doxygen group hipHostMalloc. + * @} + */ diff --git a/projects/hip-tests/catch/perftests/memory/hipPerfMemcpy.cc b/projects/hip-tests/catch/perftests/memory/hipPerfMemcpy.cc index d5ad0786aa..c790fb5cdd 100644 --- a/projects/hip-tests/catch/perftests/memory/hipPerfMemcpy.cc +++ b/projects/hip-tests/catch/perftests/memory/hipPerfMemcpy.cc @@ -82,11 +82,8 @@ void hipPerfMemcpy::TestResult(unsigned int numTests, perf *= 2.0; } - CONSOLE_PRINT("hipPerfMemcpy[%d] %s copy BW %.2f GB/s for memory size of %lu Bytes.\n", numTests, + CONSOLE_PRINT("hipPerfMemcpy[%d] %s copy BW %.2f GB/s for memory size of %zu Bytes.\n", numTests, typestr, perf, totalSizes_[numTests]); - - if (totalSizes_[numTests] == 4194304 && type == hipMemcpyDeviceToDeviceNoCU) - REQUIRE(perf < NOCU_MAX_BW); } bool hipPerfMemcpy::run_h2d(unsigned int numTests) { diff --git a/projects/hip-tests/catch/perftests/memory/hipPerfMemset.cc b/projects/hip-tests/catch/perftests/memory/hipPerfMemset.cc index aaf434245b..49808d4967 100644 --- a/projects/hip-tests/catch/perftests/memory/hipPerfMemset.cc +++ b/projects/hip-tests/catch/perftests/memory/hipPerfMemset.cc @@ -335,7 +335,7 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type, auto sec = diff.count(); auto perf = static_cast((sizeElements * NUM_ITER * (1e-09)) / sec); - CONSOLE_PRINT("hipPerf3DMemset%s[%d] (GB/s) for %5lu x %5lu x %lu bytes : %7.2f\n", + CONSOLE_PRINT("hipPerf3DMemset%s[%d] (GB/s) for %5zu x %5zu x %zu bytes : %7.2f\n", (async ? "Async" : " "), test, bufSize_, bufSize_, depth, perf); HIP_CHECK(hipFree(devPitchedPtr.ptr)); free(A_h); diff --git a/projects/hip/include/hip/hip_runtime_api.h b/projects/hip/include/hip/hip_runtime_api.h index 5771ef3fcf..847e11a962 100644 --- a/projects/hip/include/hip/hip_runtime_api.h +++ b/projects/hip/include/hip/hip_runtime_api.h @@ -557,6 +557,8 @@ typedef enum hipDeviceAttribute_t { ///< hipHostRegister hipDeviceAttributeMemoryPoolSupportedHandleTypes, ///< Supported handle mask for HIP Stream ///< Ordered Memory Allocator + hipDeviceAttributeHostNumaId, ///< NUMA ID of the cpu node closest to the device, + ///< or -1 when NUMA isn't supported hipDeviceAttributeCudaCompatibleEnd = 9999, hipDeviceAttributeAmdSpecificBegin = 10000, diff --git a/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h index 22c8afee56..1ffdd062bf 100644 --- a/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h +++ b/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h @@ -2978,6 +2978,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att case hipDeviceAttributeMemoryPoolSupportedHandleTypes: cdattr = cudaDevAttrMemoryPoolSupportedHandleTypes; break; + case hipDeviceAttributeHostNumaId: + cdattr = cudaDevAttrHostNumaId; + break; default: return hipCUDAErrorTohipError(cudaErrorInvalidValue); }