diff --git a/projects/rdc/CMakeLists.txt b/projects/rdc/CMakeLists.txt index 907c9fbff4..a63240e058 100755 --- a/projects/rdc/CMakeLists.txt +++ b/projects/rdc/CMakeLists.txt @@ -23,9 +23,13 @@ # cmake_minimum_required(VERSION 3.15) -set( COMP_TYPE "runtime" ) -set( BUILD_ENABLE_LINTIAN_OVERRIDES ON CACHE BOOL "Enable/Disable Lintian Overrides" ) -set( BUILD_DEBIAN_PKGING_FLAG ON CACHE BOOL "Internal Status Flag to indicate Debian Packaging Build" ) +set(COMP_TYPE "runtime") +set(BUILD_ENABLE_LINTIAN_OVERRIDES ON CACHE BOOL "Enable/Disable Lintian Overrides") +set(BUILD_DEBIAN_PKGING_FLAG + ON + CACHE BOOL + "Internal Status Flag to indicate Debian Packaging Build" +) set(RDC "rdc" CACHE INTERNAL "") set(RDC_PACKAGE ${RDC} CACHE STRING "") @@ -89,7 +93,7 @@ include(utils) set(PKG_VERSION_GIT_TAG_PREFIX "rdc_pkg_ver") # Provide git to utilities find_program(GIT NAMES git) -get_version_from_tag("1.2.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +get_version_from_tag("1.3.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) # VERSION_* variables should be set by get_version_from_tag message("Package version: ${VERSION_STRING}") @@ -167,8 +171,8 @@ set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} /usr/lib64 /usr/lib/x86_64-linux-gn # configure packaging # cpack version is populated with CMAKE_PROJECT_VERSION implicitly -set(PKG_MAINTAINER_NM "RDC Support") -set(PKG_MAINTAINER_EMAIL "rdc.support@amd.com") +set(PKG_MAINTAINER_NM "RDC Support") +set(PKG_MAINTAINER_EMAIL "rdc.support@amd.com") set(CPACK_PACKAGE_NAME ${RDC_PACKAGE} CACHE INTERNAL "") set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." CACHE STRING "") set(CPACK_PACKAGE_CONTACT "${PKG_MAINTAINER_NM} <${PKG_MAINTAINER_EMAIL}>") @@ -565,7 +569,7 @@ set(CPACK_RPM_RUNTIME_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RP set(CPACK_DEBIAN_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}") set(CPACK_RPM_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}") -configure_pkg( ${RDC} ${COMP_TYPE} ${CPACK_PACKAGE_VERSION} ${PKG_MAINTAINER_NM} ${PKG_MAINTAINER_EMAIL} ) +configure_pkg( ${RDC} ${COMP_TYPE} ${CPACK_PACKAGE_VERSION} ${PKG_MAINTAINER_NM} ${PKG_MAINTAINER_EMAIL}) include(CPack) diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index a7fe956c39..c7cb7d392b 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -1537,6 +1537,8 @@ typedef struct { rdc_policy_condition_t condition; //!< the condition that is meet rdc_gpu_group_t group_id; //!< The group id trigger this callback int64_t value; //!< The current value that meet the condition + uint32_t gpu_index; //!< GPU index that hit the condition + bool reset_triggered; //!< if reset was attempted } rdc_policy_callback_response_t; /** diff --git a/projects/rdc/protos/rdc.proto b/projects/rdc/protos/rdc.proto index 8af03999d7..b2ded4ab27 100755 --- a/projects/rdc/protos/rdc.proto +++ b/projects/rdc/protos/rdc.proto @@ -659,6 +659,8 @@ message RegisterPolicyResponse { PolicyCondition condition =3; uint32 group_id =4; uint64 value=5; + uint32 gpu_index=6; + bool reset_triggered = 7; } message UnRegisterPolicyResult { diff --git a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt index 136c3cfd05..e07f597560 100755 --- a/projects/rdc/rdc_libs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/CMakeLists.txt @@ -58,7 +58,7 @@ find_program(GIT NAMES git) # Debian package specific variables # Set a default value for the package version -get_version_from_tag("1.2.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT) +get_version_from_tag("1.3.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT) # VERSION_* variables should be set by get_version_from_tag set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}") diff --git a/projects/rdc/rdc_libs/rdc/src/RdcPolicyImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcPolicyImpl.cc index e2fc262401..ee688b72a1 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcPolicyImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcPolicyImpl.cc @@ -218,14 +218,17 @@ void RdcPolicyImpl::rdc_policy_check_condition() { status = metric_fetcher_->fetch_smi_field(gpu_index, map[policy.condition.type], &value); if (status == RDC_ST_OK) { if (value.value.l_int > policy.condition.value) { + + bool reset = RDC_POLICY_ACTION_GPU_RESET == policy.action; + // callback if needed if (callback) { rdc_policy_callback_response_t response = {1, policy.condition, group_id, - value.value.l_int}; + value.value.l_int, gpu_index, reset}; callback(&response); } - if (RDC_POLICY_ACTION_GPU_RESET == policy.action) { + if (reset) { rdc_policy_gpu_reset(gpu_index); } } diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 9bd0c28297..a3587771b5 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -918,6 +918,8 @@ rdc_status_t RdcStandaloneHandler::rdc_policy_register(rdc_gpu_group_t group_id, response.condition.value = cond.value(); response.group_id = reply.group_id(); response.value = reply.value(); + response.gpu_index = reply.gpu_index(); + response.reset_triggered = reply.reset_triggered(); callback(&response); } diff --git a/projects/rdc/rdci/src/RdciPolicySubSystem.cc b/projects/rdc/rdci/src/RdciPolicySubSystem.cc index c8bb6fa15e..ad85ed8ab6 100644 --- a/projects/rdc/rdci/src/RdciPolicySubSystem.cc +++ b/projects/rdc/rdci/src/RdciPolicySubSystem.cc @@ -195,8 +195,16 @@ int rdc_policy_callback(rdc_policy_callback_response_t* userData) { threshold /= 1000000; } - std::cout << "A " << condition_type_to_str(userData->condition.type) << " exceeds the threshold " - << threshold << " with the value " << value << std::endl; + std::cout << "A " << condition_type_to_str(userData->condition.type) + << " exceeds the threshold " << threshold + << " with the value " << value; + + if (userData->reset_triggered) { + std::cout << " and triggered reset on GPU " << userData->gpu_index; + } + + std::cout << std::endl; + last_time = now; // update the last time return 0; } diff --git a/projects/rdc/rdci/src/rdci.cc b/projects/rdc/rdci/src/rdci.cc index 65ac82ca7a..f81c2fb97f 100644 --- a/projects/rdc/rdci/src/rdci.cc +++ b/projects/rdc/rdci/src/rdci.cc @@ -41,7 +41,7 @@ THE SOFTWARE. #include "rdc_lib/rdc_common.h" #define RDC_CLIENT_VERSION_MAJOR 1 -#define RDC_CLIENT_VERSION_MINOR 2 +#define RDC_CLIENT_VERSION_MINOR 3 #define RDC_CLIENT_VERSION_RELEASE 0 #define RDC_CLIENT_VERSION_CREATE_STRING(MAJOR, MINOR, RELEASE) (#MAJOR "." #MINOR "." #RELEASE) diff --git a/projects/rdc/server/include/rdc/rdc_server_main.h b/projects/rdc/server/include/rdc/rdc_server_main.h index 75e58b7dd9..bc13f1df09 100644 --- a/projects/rdc/server/include/rdc/rdc_server_main.h +++ b/projects/rdc/server/include/rdc/rdc_server_main.h @@ -31,7 +31,7 @@ THE SOFTWARE. #include "rdc/rdc_api_service.h" #define RDC_SERVER_VERSION_MAJOR 1 -#define RDC_SERVER_VERSION_MINOR 2 +#define RDC_SERVER_VERSION_MINOR 3 #define RDC_SERVER_VERSION_RELEASE 0 #define RDC_SERVER_VERSION_CREATE_STRING(MAJOR, MINOR, RELEASE) (#MAJOR "." #MINOR "." #RELEASE) diff --git a/projects/rdc/server/src/rdc_api_service.cc b/projects/rdc/server/src/rdc_api_service.cc index 06b25a72ed..d87463f0c2 100644 --- a/projects/rdc/server/src/rdc_api_service.cc +++ b/projects/rdc/server/src/rdc_api_service.cc @@ -931,7 +931,8 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData) ::rdc::PolicyCondition* cond = reply.mutable_condition(); cond->set_type(static_cast<::rdc::PolicyCondition_Type>(ctx->response.condition.type)); cond->set_value(ctx->response.condition.value); - + reply.set_gpu_index(ctx->response.gpu_index); + reply.set_reset_triggered(ctx->response.reset_triggered != 0); writer->Write(reply); }