[SWDEV-548460] Add RDC Policy Reset Message (#2180)

* [SWDEV-548460] Add RDC Policy Reset Message

* [rdc] Bump version to 1.3.0

Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>

* chore: [rdc] Format CMakeLists.txt

Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>

---------

Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
Co-authored-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
This commit is contained in:
Adam Pryor
2025-12-29 10:31:13 -06:00
کامیت شده توسط GitHub
والد 741b4b9fdf
کامیت 5bf6e366dd
10 فایل تغییر یافته به همراه 37 افزوده شده و 15 حذف شده
+11 -7
مشاهده پرونده
@@ -23,9 +23,13 @@
#
cmake_minimum_required(VERSION 3.15)
set( COMP_TYPE "runtime" )
set( BUILD_ENABLE_LINTIAN_OVERRIDES ON CACHE BOOL "Enable/Disable Lintian Overrides" )
set( BUILD_DEBIAN_PKGING_FLAG ON CACHE BOOL "Internal Status Flag to indicate Debian Packaging Build" )
set(COMP_TYPE "runtime")
set(BUILD_ENABLE_LINTIAN_OVERRIDES ON CACHE BOOL "Enable/Disable Lintian Overrides")
set(BUILD_DEBIAN_PKGING_FLAG
ON
CACHE BOOL
"Internal Status Flag to indicate Debian Packaging Build"
)
set(RDC "rdc" CACHE INTERNAL "")
set(RDC_PACKAGE ${RDC} CACHE STRING "")
@@ -89,7 +93,7 @@ include(utils)
set(PKG_VERSION_GIT_TAG_PREFIX "rdc_pkg_ver")
# Provide git to utilities
find_program(GIT NAMES git)
get_version_from_tag("1.2.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
get_version_from_tag("1.3.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
# VERSION_* variables should be set by get_version_from_tag
message("Package version: ${VERSION_STRING}")
@@ -167,8 +171,8 @@ set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} /usr/lib64 /usr/lib/x86_64-linux-gn
# configure packaging
# cpack version is populated with CMAKE_PROJECT_VERSION implicitly
set(PKG_MAINTAINER_NM "RDC Support")
set(PKG_MAINTAINER_EMAIL "rdc.support@amd.com")
set(PKG_MAINTAINER_NM "RDC Support")
set(PKG_MAINTAINER_EMAIL "rdc.support@amd.com")
set(CPACK_PACKAGE_NAME ${RDC_PACKAGE} CACHE INTERNAL "")
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." CACHE STRING "")
set(CPACK_PACKAGE_CONTACT "${PKG_MAINTAINER_NM} <${PKG_MAINTAINER_EMAIL}>")
@@ -565,7 +569,7 @@ set(CPACK_RPM_RUNTIME_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RP
set(CPACK_DEBIAN_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}")
set(CPACK_RPM_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}")
configure_pkg( ${RDC} ${COMP_TYPE} ${CPACK_PACKAGE_VERSION} ${PKG_MAINTAINER_NM} ${PKG_MAINTAINER_EMAIL} )
configure_pkg( ${RDC} ${COMP_TYPE} ${CPACK_PACKAGE_VERSION} ${PKG_MAINTAINER_NM} ${PKG_MAINTAINER_EMAIL})
include(CPack)
@@ -1537,6 +1537,8 @@ typedef struct {
rdc_policy_condition_t condition; //!< the condition that is meet
rdc_gpu_group_t group_id; //!< The group id trigger this callback
int64_t value; //!< The current value that meet the condition
uint32_t gpu_index; //!< GPU index that hit the condition
bool reset_triggered; //!< if reset was attempted
} rdc_policy_callback_response_t;
/**
@@ -659,6 +659,8 @@ message RegisterPolicyResponse {
PolicyCondition condition =3;
uint32 group_id =4;
uint64 value=5;
uint32 gpu_index=6;
bool reset_triggered = 7;
}
message UnRegisterPolicyResult {
@@ -58,7 +58,7 @@ find_program(GIT NAMES git)
# Debian package specific variables
# Set a default value for the package version
get_version_from_tag("1.2.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
get_version_from_tag("1.3.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
# VERSION_* variables should be set by get_version_from_tag
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
@@ -218,14 +218,17 @@ void RdcPolicyImpl::rdc_policy_check_condition() {
status = metric_fetcher_->fetch_smi_field(gpu_index, map[policy.condition.type], &value);
if (status == RDC_ST_OK) {
if (value.value.l_int > policy.condition.value) {
bool reset = RDC_POLICY_ACTION_GPU_RESET == policy.action;
// callback if needed
if (callback) {
rdc_policy_callback_response_t response = {1, policy.condition, group_id,
value.value.l_int};
value.value.l_int, gpu_index, reset};
callback(&response);
}
if (RDC_POLICY_ACTION_GPU_RESET == policy.action) {
if (reset) {
rdc_policy_gpu_reset(gpu_index);
}
}
@@ -918,6 +918,8 @@ rdc_status_t RdcStandaloneHandler::rdc_policy_register(rdc_gpu_group_t group_id,
response.condition.value = cond.value();
response.group_id = reply.group_id();
response.value = reply.value();
response.gpu_index = reply.gpu_index();
response.reset_triggered = reply.reset_triggered();
callback(&response);
}
@@ -195,8 +195,16 @@ int rdc_policy_callback(rdc_policy_callback_response_t* userData) {
threshold /= 1000000;
}
std::cout << "A " << condition_type_to_str(userData->condition.type) << " exceeds the threshold "
<< threshold << " with the value " << value << std::endl;
std::cout << "A " << condition_type_to_str(userData->condition.type)
<< " exceeds the threshold " << threshold
<< " with the value " << value;
if (userData->reset_triggered) {
std::cout << " and triggered reset on GPU " << userData->gpu_index;
}
std::cout << std::endl;
last_time = now; // update the last time
return 0;
}
@@ -41,7 +41,7 @@ THE SOFTWARE.
#include "rdc_lib/rdc_common.h"
#define RDC_CLIENT_VERSION_MAJOR 1
#define RDC_CLIENT_VERSION_MINOR 2
#define RDC_CLIENT_VERSION_MINOR 3
#define RDC_CLIENT_VERSION_RELEASE 0
#define RDC_CLIENT_VERSION_CREATE_STRING(MAJOR, MINOR, RELEASE) (#MAJOR "." #MINOR "." #RELEASE)
@@ -31,7 +31,7 @@ THE SOFTWARE.
#include "rdc/rdc_api_service.h"
#define RDC_SERVER_VERSION_MAJOR 1
#define RDC_SERVER_VERSION_MINOR 2
#define RDC_SERVER_VERSION_MINOR 3
#define RDC_SERVER_VERSION_RELEASE 0
#define RDC_SERVER_VERSION_CREATE_STRING(MAJOR, MINOR, RELEASE) (#MAJOR "." #MINOR "." #RELEASE)
@@ -931,7 +931,8 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
::rdc::PolicyCondition* cond = reply.mutable_condition();
cond->set_type(static_cast<::rdc::PolicyCondition_Type>(ctx->response.condition.type));
cond->set_value(ctx->response.condition.value);
reply.set_gpu_index(ctx->response.gpu_index);
reply.set_reset_triggered(ctx->response.reset_triggered != 0);
writer->Write(reply);
}