SWDEV-299893 - Set preferred node affinity

Set the CPU affinity to the NUMA node closest to the current GPU. This
reduces the latency of fetching kernel arguments, since the device queries
the CPU cache of the core that did the dispatch. This behavior is controlled
by the AMD_CPU_AFFINITY environment variable (disabled by default).

Change-Id: I65afba62cb818ea25a311b88d1c0dd5c51330292


[ROCm/clr commit: b192beea52]
Tento commit je obsažen v:
Saleel Kudchadker
2021-11-09 03:12:19 -08:00
rodič 8ff18d5e6f
revize 04a391004a
6 změnil soubory, kde provedl 31 přidání a 4 odebrání
+2
Zobrazit soubor
@@ -1730,6 +1730,8 @@ class Device : public RuntimeObject {
) const {
return false;
};
//! Returns the index of the NUMA node preferred for this device.
//! The base implementation always reports node 0.
virtual const uint32_t getPreferredNumaNode() const {
  return 0;
}
//! Hook for releasing a global signal (going by the name -- confirm against the
//! overriding device classes). This base implementation does nothing.
virtual void ReleaseGlobalSignal(void* signal) const {}
//! Returns TRUE if the device is available for computations
+3 -2
Zobrazit soubor
@@ -170,7 +170,8 @@ Device::Device(hsa_agent_t bkendDevice)
, queuePool_(QueuePriority::Total)
, coopHostcallBuffer_(nullptr)
, queueWithCUMaskPool_(QueuePriority::Total)
, numOfVgpus_(0) {
, numOfVgpus_(0)
, preferred_numa_node_(0) {
group_segment_.handle = 0;
system_segment_.handle = 0;
system_coarse_segment_.handle = 0;
@@ -194,7 +195,7 @@ void Device::setupCpuAgent() {
}
}
}
preferred_numa_node_ = index;
cpu_agent_ = cpu_agents_[index].agent;
system_segment_ = cpu_agents_[index].fine_grain_pool;
system_coarse_segment_ = cpu_agents_[index].coarse_grain_pool;
+3
Zobrazit soubor
@@ -534,6 +534,8 @@ class Device : public NullDevice {
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset);
//! Returns the NUMA node index stored in preferred_numa_node_ for this device
const uint32_t getPreferredNumaNode() const {
  return preferred_numa_node_;
}
private:
bool create();
@@ -555,6 +557,7 @@ class Device : public NullDevice {
static std::vector<AgentInfo> cpu_agents_;
hsa_agent_t cpu_agent_;
uint32_t preferred_numa_node_;
std::vector<hsa_agent_t> p2p_agents_; //!< List of P2P agents available for this device
std::vector<Device*> enabled_p2p_devices_; //!< List of user enabled P2P devices for this device
mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
+3
Zobrazit soubor
@@ -222,6 +222,9 @@ class Os : AllStatic {
//! Platform-specific optimized memcpy()
static void* fastMemcpy(void* dest, const void* src, size_t n);
//! NUMA related settings
//! Sets the CPU affinity to the CPUs of NUMA node \a node. The Linux
//! implementation only acts when AMD_CPU_AFFINITY is enabled; the Windows
//! implementation is a no-op.
static void setPreferredNumaNode(uint32_t node);
// File/Path helper routines:
//
+18 -2
Zobrazit soubor
@@ -48,6 +48,10 @@
#define DT_GNU_HASH 0x6ffffef5
#endif // DT_GNU_HASH
#ifdef ROCCLR_SUPPORT_NUMA_POLICY
#include <numa.h>
#endif // ROCCLR_SUPPORT_NUMA_POLICY
#include <atomic>
#include <vector>
#include <string>
@@ -60,7 +64,6 @@
#include <algorithm>
#include <mutex>
namespace amd {
static struct sigaction oldSigAction;
@@ -121,7 +124,6 @@ static void divisionErrorHandler(int sig, siginfo_t* info, void* ptr) {
return;
}
std::cerr << "Unhandled signal in divisionErrorHandler()" << std::endl;
::abort();
}
@@ -306,6 +308,20 @@ void Os::currentStackInfo(address* base, size_t* size) {
// Names the calling thread by handing `name` to prctl(PR_SET_NAME).
void Os::setCurrentThreadName(const char* name) {
  ::prctl(PR_SET_NAME, name);
}
// Restricts the CPU affinity of the caller (pid 0 in numa_sched_setaffinity) to
// the CPUs that belong to NUMA node `node`.
//
// The call is a no-op unless all of the following hold:
//   - the runtime was built with ROCCLR_SUPPORT_NUMA_POLICY (the only case in
//     which <numa.h> is included by this file),
//   - the AMD_CPU_AFFINITY setting is enabled,
//   - libnuma reports that NUMA is available on this system.
void Os::setPreferredNumaNode(uint32_t node) {
#ifdef ROCCLR_SUPPORT_NUMA_POLICY
  // numa_available() must succeed before any other libnuma function is used;
  // otherwise the behavior of those functions is undefined.
  if (!AMD_CPU_AFFINITY || (numa_available() < 0)) {
    return;
  }

  // numa_allocate_cpumask() returns a mask sized for every possible CPU, which
  // is what numa_node_to_cpus() requires, and it pairs with numa_free_cpumask().
  bitmask* bm = numa_allocate_cpumask();
  if (bm == nullptr) {
    return;
  }

  // Only apply the mask when the node -> CPUs lookup actually succeeded.
  if ((numa_node_to_cpus(static_cast<int>(node), bm) < 0) ||
      (numa_sched_setaffinity(0, bm) < 0)) {
    assert(0 && "failed to set affinity");
  }
  numa_free_cpumask(bm);
#else
  // NUMA policy support is compiled out: nothing to do.
  (void)node;
#endif  // ROCCLR_SUPPORT_NUMA_POLICY
}
void* Thread::entry(Thread* thread) {
sigset_t set;
+2
Zobrazit soubor
@@ -250,6 +250,8 @@ static void SetThreadName(DWORD threadId, const char* name) {
void Os::setCurrentThreadName(const char* name) { SetThreadName(GetCurrentThreadId(), name); }
// NUMA affinity is not implemented on Windows: the requested node is ignored.
void Os::setPreferredNumaNode(uint32_t node) {}
static LONG WINAPI divExceptionFilter(struct _EXCEPTION_POINTERS* ep) {
DWORD code = ep->ExceptionRecord->ExceptionCode;