From cc48dfdbffbcd1920e0b9f77a5d1fdc095d8bd78 Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Tue, 15 Nov 2022 17:21:59 +0000 Subject: [PATCH] Use mwaitx when busy-waiting signals Use mwaitx instructions when busy waiting for signals to reduce CPU energy usage. This can be disabled by setting HSA_ENABLE_MWAITX=0 Change-Id: Ic207895a491b2bf6dacba47ef0921df3faad5b5a --- runtime/hsa-runtime/CMakeLists.txt | 2 +- runtime/hsa-runtime/core/inc/runtime.h | 1 + .../core/runtime/default_signal.cpp | 11 +++++++- .../core/runtime/interrupt_signal.cpp | 17 +++++++++++-- runtime/hsa-runtime/core/runtime/runtime.cpp | 6 +++++ runtime/hsa-runtime/core/runtime/signal.cpp | 1 + runtime/hsa-runtime/core/util/flag.h | 10 ++++++++ runtime/hsa-runtime/inc/hsa.h | 25 +++++++++++-------- 8 files changed, 59 insertions(+), 14 deletions(-) diff --git a/runtime/hsa-runtime/CMakeLists.txt b/runtime/hsa-runtime/CMakeLists.txt index c1961443f1..88c18c011d 100644 --- a/runtime/hsa-runtime/CMakeLists.txt +++ b/runtime/hsa-runtime/CMakeLists.txt @@ -129,7 +129,7 @@ target_include_directories( ${CORE_RUNTIME_TARGET} set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY INSTALL_RPATH "$ORIGIN;$ORIGIN/../../lib;$ORIGIN/../../lib64;$ORIGIN/../lib64" ) ## ------------------------- Linux Compiler and Linker options ------------------------- -set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function ) +set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants 
-fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function -mmwaitx ) ## Extra image settings - audit! set ( HSA_CXX_FLAGS ${HSA_CXX_FLAGS} -Wno-deprecated-declarations ) diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index 680fd833a3..c0c1344f2a 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -86,6 +86,7 @@ namespace AMD { namespace core { extern bool g_use_interrupt_wait; +extern bool g_use_mwaitx; /// @brief Runtime class provides the following functions: /// - open and close connection to kernel driver. diff --git a/runtime/hsa-runtime/core/runtime/default_signal.cpp b/runtime/hsa-runtime/core/runtime/default_signal.cpp index 281b295f24..bd2f7cf1fc 100644 --- a/runtime/hsa-runtime/core/runtime/default_signal.cpp +++ b/runtime/hsa-runtime/core/runtime/default_signal.cpp @@ -42,6 +42,9 @@ #include "core/inc/default_signal.h" #include "core/util/timer.h" +#include <mwaitxintrin.h> + +#define MWAITX_ECX_TIMER_ENABLE 0x2 // BIT(1) namespace rocr { namespace core { @@ -100,6 +103,8 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition, timer::duration_from_seconds( double(timeout) / double(hsa_freq)); + if (g_use_mwaitx) _mm_monitorx(const_cast<int64_t*>(&signal_.value), 0, 0); + while (true) { if (!IsValid()) return 0; @@ -132,8 +137,12 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition, value = atomic::Load(&signal_.value, std::memory_order_relaxed); return hsa_signal_value_t(value); } - if (time - start_time > kMaxElapsed) { + + if (time - start_time > kMaxElapsed) os::uSleep(20); + else if (g_use_mwaitx) { + _mm_mwaitx(0, 60000, MWAITX_ECX_TIMER_ENABLE); // 60000 TSC cycles: ~40us at 1.5GHz, ~20us at 3GHz + _mm_monitorx(const_cast<int64_t*>(&signal_.value), 0, 0); } } } diff --git a/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp 
b/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp index 0a9ff803c5..773bbffa1e 100644 --- a/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp +++ b/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp @@ -44,6 +44,9 @@ #include "core/inc/runtime.h" #include "core/util/timer.h" #include "core/util/locks.h" +#include <mwaitxintrin.h> + +#define MWAITX_ECX_TIMER_ENABLE 0x2 // BIT(1) namespace rocr { namespace core { @@ -162,6 +165,8 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( double(timeout) / double(hsa_freq)); bool condition_met = false; + if (g_use_mwaitx) _mm_monitorx(const_cast<int64_t*>(&signal_.value), 0, 0); + while (true) { if (!IsValid()) return 0; @@ -194,13 +199,21 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( value = atomic::Load(&signal_.value, std::memory_order_relaxed); return hsa_signal_value_t(value); } - + if (wait_hint == HSA_WAIT_STATE_ACTIVE) { + if (g_use_mwaitx) { + _mm_mwaitx(0, 0, 0); + _mm_monitorx(const_cast<int64_t*>(&signal_.value), 0, 0); + } continue; } if (time - start_time < kMaxElapsed) { - // os::uSleep(20); + // os::uSleep(20); + if (g_use_mwaitx) { + _mm_mwaitx(0, 60000, MWAITX_ECX_TIMER_ENABLE); + _mm_monitorx(const_cast<int64_t*>(&signal_.value), 0, 0); + } continue; } diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index 9a56594ddb..e9f44868a1 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -71,6 +71,7 @@ const char rocrbuildid[] __attribute__((used)) = "ROCR BUILD ID: " STRING(ROCR_B namespace rocr { namespace core { bool g_use_interrupt_wait = true; +bool g_use_mwaitx = true; Runtime* Runtime::runtime_singleton_ = NULL; @@ -680,6 +681,10 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) { *(bool*)value = ret; break; } + case HSA_AMD_SYSTEM_INFO_MWAITX_ENABLED: { + *((bool*)value) = g_use_mwaitx; + break; + } default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -1405,6 +1410,7 @@ hsa_status_t 
Runtime::Load() { flag_.Refresh(); g_use_interrupt_wait = flag_.enable_interrupt(); + g_use_mwaitx = flag_.check_mwaitx(cpuinfo.mwaitx); if (!AMD::Load()) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; diff --git a/runtime/hsa-runtime/core/runtime/signal.cpp b/runtime/hsa-runtime/core/runtime/signal.cpp index d424e29a06..486a2a305e 100644 --- a/runtime/hsa-runtime/core/runtime/signal.cpp +++ b/runtime/hsa-runtime/core/runtime/signal.cpp @@ -245,6 +245,7 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals, bool condition_met = false; while (true) { + // Cannot mwaitx - polling multiple signals for (uint32_t i = 0; i < signal_count; i++) { if (!signals[i]->IsValid()) return uint32_t(-1); diff --git a/runtime/hsa-runtime/core/util/flag.h b/runtime/hsa-runtime/core/util/flag.h index 2b0ffcff09..685899001d 100644 --- a/runtime/hsa-runtime/core/util/flag.h +++ b/runtime/hsa-runtime/core/util/flag.h @@ -163,6 +163,9 @@ class Flag { var = os::GetEnvVar("HSA_IMAGE_PRINT_SRD"); image_print_srd_ = (var == "1") ? true : false; + var = os::GetEnvVar("HSA_ENABLE_MWAITX"); + enable_mwaitx_ = (var == "0") ? false : true; + // Temporary environment variable to disable CPU affinity override // Will either rename to HSA_OVERRIDE_CPU_AFFINITY later or remove completely. 
var = os::GetEnvVar("HSA_OVERRIDE_CPU_AFFINITY_DEBUG"); @@ -224,6 +227,12 @@ class Flag { bool image_print_srd() const { return image_print_srd_; } + bool check_mwaitx(bool mwaitx_supported) { + if (enable_mwaitx_ && !mwaitx_supported) enable_mwaitx_ = false; + + return enable_mwaitx_; + } + XNACK_REQUEST xnack() const { return xnack_; } bool debug() const { return debug_; } @@ -266,6 +275,7 @@ class Flag { bool discover_copy_agents_; bool override_cpu_affinity_; bool image_print_srd_; + bool enable_mwaitx_; SDMA_OVERRIDE enable_sdma_; diff --git a/runtime/hsa-runtime/inc/hsa.h b/runtime/hsa-runtime/inc/hsa.h index 96730baf35..8b668f1e9e 100644 --- a/runtime/hsa-runtime/inc/hsa.h +++ b/runtime/hsa-runtime/inc/hsa.h @@ -482,19 +482,24 @@ typedef enum { */ HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, /** - * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of - * this attribute is bool. - */ + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, // TODO: Should this be per Agent? /** - * Returns true if all Agents have access to system allocated memory (such as - * that allocated by mmap, malloc, or new) by default. - * If false then system allocated memory may only be made SVM accessible to - * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. - * The type of this attribute is bool. - */ - HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202 + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. + * The type of this attribute is bool. 
+ */ + HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202, + /** + * Returns true if the runtime uses mwaitx when busy-waiting on signals. + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_MWAITX_ENABLED = 0x203 } hsa_system_info_t; /**