diff --git a/common/rdc_field.data b/common/rdc_field.data index 1a2deb6c57..5f93337a47 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -53,6 +53,8 @@ FLD_DESC_ENT(RDC_FI_GPU_MM_ENC_UTIL, "Mutilmedia encoder busy percentage", FLD_DESC_ENT(RDC_FI_GPU_MM_DEC_UTIL, "Mutilmedia decoder busy percentage", "GPU_MM_DEC_UTIL", true) FLD_DESC_ENT(RDC_FI_GPU_MEMORY_ACTIVITY, "Memory busy percentage", "GPU_MEM_UTIL", true) +FLD_DESC_ENT(RDC_FI_GPU_PAGE_RETRIED, "Retried page of the GPU instance", "GPU_PAGE_RETRIED", true) + // ECC totals FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated Single Error Correction", "ECC_CORRECT", true) FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated Double Error Detection", "ECC_UNCORRECT", true) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 01f251d1dd..ac6019b2fe 100755 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -113,6 +113,13 @@ set(ROCPROFILER_EXAMPLE_EXE "rocprofiler") add_executable(${ROCPROFILER_EXAMPLE_EXE} "${ROCPROFILER_EXAMPLE_SRC_LIST}") target_link_libraries(${ROCPROFILER_EXAMPLE_EXE} pthread dl rdc_bootstrap) + +set(POLICY_EXAMPLE_SRC_LIST "policy_example.cc") +cmake_print_variables(POLICY_EXAMPLE_SRC_LIST) +set(POLICY_EXAMPLE_EXE "policy") +add_executable(${POLICY_EXAMPLE_EXE} "${POLICY_EXAMPLE_SRC_LIST}") +target_link_libraries(${POLICY_EXAMPLE_EXE} pthread dl rdc_bootstrap) + message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") message(" Finished Cmake Example ") message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") diff --git a/example/policy_example.cc b/example/policy_example.cc new file mode 100644 index 0000000000..437bdb7ef0 --- /dev/null +++ b/example/policy_example.cc @@ -0,0 +1,195 @@ +#include + +#include + +#include "rdc/rdc.h" + +static const char* condition_type_to_str(rdc_policy_condition_type_t type) { + if (type == RDC_POLICY_COND_MAX_PAGE_RETRIED) return "Retried Page Limit"; + if (type == 
RDC_POLICY_COND_THERMAL) return "Temperature Limit"; + if (type == RDC_POLICY_COND_POWER) return "Power Limit"; + return "Unknown_Type"; +} + +static time_t last_time = 0; // last time to print message +int rdc_policy_callback(rdc_policy_callback_response_t* userData) { + if (userData == nullptr) { + std::cerr << "The rdc_policy_callback returns null data\n"; + return 1; + } + + // To avoid flooding too many messages, only print message every 5 seconds + time_t now = time(NULL); + if (difftime(now, last_time) < 5) { + return 0; + } + std::cout << "The " << condition_type_to_str(userData->condition.type) + << " exceeds the threshold " << userData->condition.value << " with the value " + << userData->value << std::endl; + last_time = now; // update the last time + return 0; +} + +int main() { + rdc_gpu_group_t group_id; + rdc_status_t result; + bool standalone = false; + rdc_handle_t rdc_handle = nullptr; // stays null until connect/start succeeds + uint32_t count = 0; + + char hostIpAddress[] = {"localhost:50051"}; + char group_name[] = {"group1"}; + + // Select the embedded mode and standalone mode dynamically. + std::cout << "Start rdci in: \n"; + std::cout << "0 - Embedded mode \n"; + std::cout << "1 - Standalone mode \n"; + while (!(std::cin >> standalone) && !std::cin.eof()) { // stop retrying at end of input + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); + } + std::cout << std::endl; + std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n"); + + // Init the rdc + result = rdc_init(0); + + if (result != RDC_ST_OK) { + std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } else { + std::cout << "RDC Initialized.\n"; + } + + if (standalone) { // standalone + result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); + if (result != RDC_ST_OK) { + std::cout << "Error connecting to remote rdcd. 
Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + } else { // embedded + result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); + if (result != RDC_ST_OK) { + std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + } + + // Now we can use the same API for both standalone and embedded + // Get the list of devices in the system + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + result = rdc_device_get_all(rdc_handle, gpu_index_list, &count); + if (result != RDC_ST_OK) { + std::cout << "Error to find devices on the system. Return: " << rdc_status_string(result); + goto cleanup; + } + if (count == 0) { + std::cout << "No GPUs find on the sytem "; + goto cleanup; + } else { + std::cout << count << " GPUs found in the system.\n"; + } + + // Create the group + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id); + if (result != RDC_ST_OK) { + std::cout << "Error creating group. Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Created the GPU group " << group_id << std::endl; + + // Add all GPUs to the group + for (uint32_t i = 0; i < count; i++) { + result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]); // Add this GPU to the group + if (result != RDC_ST_OK) { + std::cout << "Error adding group. Return: " << rdc_status_string(result); + goto cleanup; + } + rdc_device_attributes_t attribute; + result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute); + if (result != RDC_ST_OK) { + std::cout << "Error get GPU attribute. 
Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Add GPU " << gpu_index_list[i] << ":" << attribute.device_name << " to group " + << group_id << std::endl; + } + + // Define a policy to print out message when temperature is above 30 degree + // or power usage is more than 150W + rdc_policy_t policy; + policy.condition = {RDC_POLICY_COND_THERMAL, 30 * 1000}; // convert to milli degree + policy.action = RDC_POLICY_ACTION_NONE; // Notify only + result = rdc_policy_set(rdc_handle, group_id, policy); + if (result != RDC_ST_OK) { + std::cout << "Error set policy RDC_POLICY_COND_THERMAL, Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + + policy.condition = {RDC_POLICY_COND_POWER, 150000}; // 150 W expressed in milliwatt + policy.action = RDC_POLICY_ACTION_NONE; // Notify only + result = rdc_policy_set(rdc_handle, group_id, policy); + if (result != RDC_ST_OK) { + std::cout << "Error set policy RDC_POLICY_COND_POWER, Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + + policy.condition = {RDC_POLICY_COND_MAX_PAGE_RETRIED, 100}; // plain page count, no unit conversion + policy.action = RDC_POLICY_ACTION_NONE; // Notify only + result = rdc_policy_set(rdc_handle, group_id, policy); + if (result != RDC_ST_OK) { + std::cout << "Error set policy RDC_POLICY_COND_MAX_PAGE_RETRIED, Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + + rdc_policy_t policy_get[RDC_MAX_POLICY_SETTINGS]; + result = rdc_policy_get(rdc_handle, group_id, &count, policy_get); + if (result != RDC_ST_OK) { + std::cout << "Error get policy, Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } + + // Register a function to listen to the events + result = rdc_policy_register(rdc_handle, group_id, rdc_policy_callback); + if (result != RDC_ST_OK) { + std::cout << "Error register policy, Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } + + std::cout << "Wait 30 seconds for the events 
happening ...\n" << std::endl; + + // If the events happen, the registered callback rdc_policy_callback will be called. + sleep(30); // sleep 30 seconds; usleep() is only specified for intervals below one second + + // Un-register the events + result = rdc_policy_unregister(rdc_handle, group_id); + if (result != RDC_ST_OK) { + std::cout << "Error unregister policy, Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } + + // clear the events + rdc_policy_condition_type_t condition_type; + condition_type = RDC_POLICY_COND_THERMAL; + result = rdc_policy_delete(rdc_handle, group_id, condition_type); + if (result != RDC_ST_OK) { + std::cout << "Error clear policy, Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } + +// Clean up; the handle is released only if connect/start actually produced one +cleanup: + std::cout << "Cleaning up.\n"; + if (rdc_handle != nullptr && standalone) + rdc_disconnect(rdc_handle); + else if (rdc_handle != nullptr) + rdc_stop_embedded(rdc_handle); + rdc_shutdown(); + return result; +} \ No newline at end of file diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index 28c969df18..ab85e94e63 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -189,6 +189,10 @@ typedef enum { RDC_FI_GPU_MM_DEC_UTIL, //!< Multimedia decoder busy percentage RDC_FI_GPU_MEMORY_ACTIVITY, //!< Memory busy percentage + /** + * @brief GPU page related fields + */ + RDC_FI_GPU_PAGE_RETRIED = 550, //!< Retried page of the GPU instance /** * @brief ECC related fields */ @@ -552,6 +556,31 @@ typedef struct { rdc_diag_test_result_t diag_info[MAX_TEST_CASES]; } rdc_diag_response_t; +/** + * @brief The policy type to support + */ +typedef enum { + RDC_POLICY_COND_MAX_PAGE_RETRIED, //!< Max number of page retired + RDC_POLICY_COND_THERMAL, //!< Temperature threshold, millidegree Celsius + RDC_POLICY_COND_POWER, //!< Power threshold, unit milliwatt + RDC_POLICY_COND_MAX +} rdc_policy_condition_type_t; + +typedef struct { + 
rdc_policy_condition_type_t type; + int64_t value; +} rdc_policy_condition_t; + +typedef enum { RDC_POLICY_ACTION_NONE, RDC_POLICY_ACTION_GPU_RESET } rdc_policy_action_t; 
+ +/** + * @brief The structure to define policy to enforce on GPU. + */ +typedef struct { + rdc_policy_condition_t condition; //!< condition to meet + rdc_policy_action_t action; //!< Action to take +} rdc_policy_t; + /** * @brief Initialize ROCm RDC. * @@ -1131,6 +1160,107 @@ rdc_field_t get_field_id_from_name(const char* name); */ const char* rdc_diagnostic_result_string(rdc_diag_result_t result); +/** + * @brief Set the RDC policy. Each group has multiple policies, these policies can be set by this + * API one by one. Multiple calls of this API will override the existing policy. + * + * @details Set the RDC policy + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id. + * + * @param[in] policy The policy to set + * + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_policy_t policy); + +#define RDC_MAX_POLICY_SETTINGS 32 + +/** + * @brief Get the RDC policy + * + * @details Get the RDC policy + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id. + * + * @param[out] count The size of policies array + * + * @param[out] policies The policies to get + * + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]); + +/** + * @brief delete the RDC policy for this group based on condition type + * + * @details clear the RDC policy for this group based on condition type. In a GPU group, only one + * policy can be set for a specific rdc_policy_condition_type_t + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id + * + * @param[in] condition_type The condition type to delete + * + * @retval ::RDC_ST_OK is returned upon successful call. 
+ */ + +rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type); + +/** + * Define the structure that is used in the RDC policy callback + */ +typedef struct { + unsigned int version; + rdc_policy_condition_t condition; //!< The condition that is met + rdc_gpu_group_t group_id; //!< The group id that triggers this callback + int64_t value; //!< The current value that meets the condition +} rdc_policy_callback_response_t; + +/** + * The user data is the rdc_policy_callback_response_t + */ +typedef int (*rdc_policy_register_callback)(rdc_policy_callback_response_t* userData); + +/** + * @brief Register a function to be called when a policy condition is met. + * + * @details Register the RDC policy callback + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id. + * + * @param[in] callback The callback function to be called when a condition is met. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_policy_register_callback callback); + +/** + * @brief Un-register the policy callback function of a GPU group. + * + * @details Un-register the policy callback for the given GPU group. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id); + #ifdef __cplusplus } #endif // __cplusplus diff --git a/include/rdc_lib/RdcHandler.h b/include/rdc_lib/RdcHandler.h index 167802fc55..164eae25eb 100644 --- a/include/rdc_lib/RdcHandler.h +++ b/include/rdc_lib/RdcHandler.h @@ -92,6 +92,19 @@ class RdcHandler { // It is just a client interface under the GRPC framework and is not used as an RDC API. // The reason is that RdcEmbeddedHandler::get_mixed_component_version does not need to be called. 
virtual rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) = 0; + // Policy API + virtual rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) = 0; + + virtual rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) = 0; + + virtual rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type) = 0; + + virtual rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id, + rdc_policy_register_callback callback) = 0; + + virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0; virtual ~RdcHandler() {} }; diff --git a/include/rdc_lib/RdcPolicy.h b/include/rdc_lib/RdcPolicy.h new file mode 100644 index 0000000000..12df1456c1 --- /dev/null +++ b/include/rdc_lib/RdcPolicy.h @@ -0,0 +1,57 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_RDCPOLICY_H_ +#define INCLUDE_RDC_LIB_RDCPOLICY_H_ + +#include +#include + +#include "rdc/rdc.h" +#include "rdc_lib/rdc_common.h" + +namespace amd { +namespace rdc { + +class RdcPolicy { + public: + virtual rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) = 0; + + virtual rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) = 0; + + virtual rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type) = 0; + + virtual rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id, + rdc_policy_register_callback callback) = 0; + + virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0; + + virtual ~RdcPolicy() {} +}; + +typedef std::shared_ptr RdcPolicyPtr; + +} // namespace rdc +} // namespace amd + +#endif // INCLUDE_RDC_LIB_RDCPOLICY_H_ diff --git a/include/rdc_lib/impl/RdcEmbeddedHandler.h b/include/rdc_lib/impl/RdcEmbeddedHandler.h index b6463076ed..b4549f016c 100644 --- a/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -31,6 +31,7 @@ THE SOFTWARE. #include "rdc_lib/RdcMetricsUpdater.h" #include "rdc_lib/RdcModuleMgr.h" #include "rdc_lib/RdcNotification.h" +#include "rdc_lib/RdcPolicy.h" #include "rdc_lib/RdcWatchTable.h" namespace amd { @@ -94,6 +95,18 @@ class RdcEmbeddedHandler final : public RdcHandler { // It is just a client interface under the GRPC framework and is not used as an RDC API. // Pure virtual functions need to be overridden. 
rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override; + // Policy API + rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override; + + rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override; + + rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type) override; + + rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id, rdc_policy_register_callback callback) override; + + rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override; explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode); ~RdcEmbeddedHandler() final; @@ -107,6 +120,7 @@ class RdcEmbeddedHandler final : public RdcHandler { RdcNotificationPtr rdc_notif_; RdcWatchTablePtr watch_table_; RdcMetricsUpdaterPtr metrics_updater_; + RdcPolicyPtr policy_; std::future updater_; }; diff --git a/include/rdc_lib/impl/RdcPolicyImpl.h b/include/rdc_lib/impl/RdcPolicyImpl.h new file mode 100644 index 0000000000..379bfc2716 --- /dev/null +++ b/include/rdc_lib/impl/RdcPolicyImpl.h @@ -0,0 +1,77 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCPOLICYIMPL_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCPOLICYIMPL_H_ + +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include + +#include "amd_smi/amdsmi.h" +#include "rdc_lib/RdcPolicy.h" +#include "rdc_lib/RdcMetricFetcher.h" +#include "rdc_lib/RdcGroupSettings.h" + +namespace amd { +namespace rdc { + +class RdcPolicyImpl : public RdcPolicy { + public: + RdcPolicyImpl(const RdcGroupSettingsPtr& group_settings, const RdcMetricFetcherPtr& metric_fetcher); + ~RdcPolicyImpl(); + + rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override; + + rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override; + + rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type) override; + + rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,rdc_policy_register_callback callback) override; + + rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override; + + private: + RdcGroupSettingsPtr group_settings_; + RdcMetricFetcherPtr metric_fetcher_; + std::mutex policy_mutex_; + std::thread thread_; + bool start_; + + std::map > settings_; + std::map callbacks_; + + void rdc_policy_check_condition(); + void rdc_policy_gpu_reset(uint32_t gpu_index); + rdc_policy_register_callback rdc_policy_get_callback(rdc_gpu_group_t group_id); +}; + +} // namespace rdc +} // namespace amd + +#endif // 
INCLUDE_RDC_LIB_IMPL_RDCPOLICYIMPL_H_ diff --git a/include/rdc_lib/impl/RdcStandaloneHandler.h b/include/rdc_lib/impl/RdcStandaloneHandler.h index 5434b46ff7..d6792432be 100644 --- a/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -24,6 +24,8 @@ THE SOFTWARE. #include #include +#include +#include #include "rdc.grpc.pb.h" // NOLINT #include "rdc_lib/RdcHandler.h" @@ -89,6 +91,18 @@ class RdcStandaloneHandler : public RdcHandler { // It is just a client interface under the GRPC framework and is not used as an RDC API. // Pure virtual functions need to be overridden rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override; + // Policy API + rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override; + + rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override; + + rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type) override; + + rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id, + rdc_policy_register_callback callback) override; + + rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override; explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca, const char* client_cert, const char* client_key); @@ -100,6 +114,15 @@ class RdcStandaloneHandler : public RdcHandler { bool copy_gpu_usage_info(const ::rdc::GpuUsageInfo& src, rdc_gpu_usage_info_t* target); std::unique_ptr<::rdc::RdcAPI::Stub> stub_; + // thread for policy callback + + struct policy_thread_context { + bool start; + std::thread *t; + }; + + std::map policy_threads_; + }; } // namespace rdc diff --git a/protos/rdc.proto b/protos/rdc.proto index eac7165689..d857e3f1ad 100755 --- a/protos/rdc.proto +++ b/protos/rdc.proto @@ -154,6 +154,40 @@ service RdcAPI { //Just an RPC method not used as an API rpc 
GetMixedComponentVersion(GetMixedComponentVersionRequest) returns (GetMixedComponentVersionResponse) {} + + // rdc_status_t rdc_policy_set( + // rdc_handle_t p_rdc_handle, + // rdc_gpu_group_t group_id, + // rdc_policy_t policy); + rpc SetPolicy(SetPolicyRequest) returns (SetPolicyResponse) {} + + // rdc_status_t rdc_policy_get( + // rdc_handle_t p_rdc_handle, + // rdc_gpu_group_t group_id, + // uint32_t* count, + // rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]); + rpc GetPolicy(GetPolicyRequest) returns (GetPolicyResponse) {} + + // rdc_status_t rdc_policy_delete( + // rdc_handle_t p_rdc_handle, + // rdc_gpu_group_t group_id, + // rdc_policy_condition_type_t condition_type); + rpc DeletePolicy(DeletePolicyRequest) returns (DeletePolicyResponse) {} + + + // rdc_status_t rdc_policy_register( + // rdc_handle_t p_rdc_handle, + // rdc_gpu_group_t group_id, + // rdc_policy_condition_t condition, + // rdc_policy_register_callback callback); + rpc RegisterPolicy(RegisterPolicyRequest) returns (stream RegisterPolicyResponse) {} + + // rdc_status_t rdc_policy_unregister( + // rdc_handle_t p_rdc_handle, + // rdc_gpu_group_t group_id, + // rdc_policy_condition_t condition); + rpc UnRegisterPolicy(UnRegisterPolicyRequest) returns (UnRegisterPolicyResponse) {} + } message Empty { @@ -485,4 +519,100 @@ message GetMixedComponentVersionRequest { message GetMixedComponentVersionResponse { uint32 status = 1; string version = 2; -} \ No newline at end of file +} + + +message PolicyCondition{ + enum Type { + COND_MAX_PAGE_RETRIED = 0; + COND_THERMAL = 1; + COND_POWER = 2; + }; + Type type = 1; + int64 value = 2; +} + + +message Policy{ + PolicyCondition condition = 1; + + enum Action { + ACTION_NONE = 0; + ACTION_GPU_RESET = 1; + }; + Action action = 2; +} + +message SetPolicyResult { + uint32 status = 1; +} + +message SetPolicyRequest { + uint32 group_id = 1; + Policy policy =2; +} + +message SetPolicyResponse { + uint32 status = 1; +} + + +message PolicyResponse { + uint32 count= 
1; + repeated Policy policies = 2; +} + +message GetPolicyRequest { + uint32 group_id = 1; + +} + +message GetPolicyResponse { + uint32 status = 1; + PolicyResponse response = 2; +} + +message DeletePolicyRequest { + uint32 group_id = 1; + enum PolicyConditionType{ + RDC_POLICY_COND_MAX_PAGE_RETRIED = 0; + RDC_POLICY_COND_THERMAL = 1; + RDC_POLICY_COND_POWER = 2; +}; + PolicyConditionType condition_type = 2; +} + +message DeletePolicyResponse { + uint32 status = 1; +} + + +message RegisterPolicyResult { + uint32 status = 1; +} + +message RegisterPolicyRequest { + uint32 group_id = 1; + +} + +message RegisterPolicyResponse { + uint32 status = 1; + uint32 version =2; + PolicyCondition condition =3; + uint32 group_id =4; + uint64 value=5; +} + +message UnRegisterPolicyResult { + uint32 status = 1; +} + +message UnRegisterPolicyRequest { + uint32 group_id = 1; + +} + +message UnRegisterPolicyResponse { + uint32 status = 1; +} diff --git a/rdc_libs/bootstrap/src/RdcBootStrap.cc b/rdc_libs/bootstrap/src/RdcBootStrap.cc index 91a74f19fa..b8a9ae2c6b 100644 --- a/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -409,3 +409,49 @@ char* strncpy_with_null(char* dest, const char* src, size_t n) { dest[n - 1] = '\0'; return dest; } + +rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_policy_t policy) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)->rdc_policy_set(group_id, policy); +} + +rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)->rdc_policy_get(group_id, count, policies); +} + + +rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type){ + if (!p_rdc_handle) { + return 
RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)->rdc_policy_delete(group_id, condition_type); +} + + + +rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_policy_register_callback callback) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + return static_cast(p_rdc_handle) + ->rdc_policy_register(group_id, callback); +} +rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + return static_cast(p_rdc_handle) + ->rdc_policy_unregister(group_id); +} \ No newline at end of file diff --git a/rdc_libs/rdc/CMakeLists.txt b/rdc_libs/rdc/CMakeLists.txt index 7b75656948..b8a338ab35 100644 --- a/rdc_libs/rdc/CMakeLists.txt +++ b/rdc_libs/rdc/CMakeLists.txt @@ -18,6 +18,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/RdcModuleMgrImpl.cc" "${SRC_DIR}/RdcNotificationImpl.cc" "${SRC_DIR}/RdcPerfTimer.cc" + "${SRC_DIR}/RdcPolicyImpl.cc" "${SRC_DIR}/RdcRocpLib.cc" "${SRC_DIR}/RdcRocrLib.cc" "${SRC_DIR}/RdcRVSLib.cc" diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index c7c8465788..8cc3725d52 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -34,6 +34,7 @@ THE SOFTWARE. 
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h" #include "rdc_lib/impl/RdcModuleMgrImpl.h" #include "rdc_lib/impl/RdcNotificationImpl.h" +#include "rdc_lib/impl/RdcPolicyImpl.h" #include "rdc_lib/impl/RdcWatchTableImpl.h" #include "rdc_lib/rdc_common.h" @@ -78,7 +79,8 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode) rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)), rdc_notif_(new RdcNotificationImpl()), watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, rdc_module_mgr_, rdc_notif_)), - metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) { + metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)), + policy_(new RdcPolicyImpl(group_settings_,metric_fetcher_)) { if (mode == RDC_OPERATION_MODE_AUTO) { RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO"); metrics_updater_->start(); @@ -429,5 +431,33 @@ rdc_status_t RdcEmbeddedHandler::get_mixed_component_version(mixed_component_t c return RDC_ST_OK; } +// Policy API +rdc_status_t RdcEmbeddedHandler::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) { + return policy_->rdc_policy_set(group_id, policy); +} + +rdc_status_t RdcEmbeddedHandler::rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) { + if (count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + return policy_->rdc_policy_get(group_id, count, policies); +} + +rdc_status_t RdcEmbeddedHandler::rdc_policy_delete(rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type) { + return policy_->rdc_policy_delete(group_id, condition_type); +} + +rdc_status_t RdcEmbeddedHandler::rdc_policy_register(rdc_gpu_group_t group_id, + rdc_policy_register_callback callback) { + return policy_->rdc_policy_register(group_id, callback); +} + +rdc_status_t RdcEmbeddedHandler::rdc_policy_unregister(rdc_gpu_group_t group_id) { + return policy_->rdc_policy_unregister(group_id); +} + } // namespace rdc } // 
namespace amd diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 6f1ea1620e..cb73b8648b 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -664,6 +664,15 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } break; } + case RDC_FI_GPU_PAGE_RETRIED: + uint32_t num_pages; + amdsmi_retired_page_record_t info; + value->status = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, &info); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = num_pages; + } + break; case RDC_FI_OAM_ID: { amdsmi_asic_info_t asic_info; value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info); diff --git a/rdc_libs/rdc/src/RdcPolicyImpl.cc b/rdc_libs/rdc/src/RdcPolicyImpl.cc new file mode 100644 index 0000000000..29422e07d4 --- /dev/null +++ b/rdc_libs/rdc/src/RdcPolicyImpl.cc @@ -0,0 +1,258 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "rdc_lib/impl/RdcPolicyImpl.h" + +#include +#include + +#include +#include +#include +#include +#include + +#include "common/rdc_utils.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/impl/SmiUtils.h" +#include "rdc_lib/rdc_common.h" + +namespace amd { +namespace rdc { + +RdcPolicyImpl::RdcPolicyImpl(const RdcGroupSettingsPtr& group_settings, + const RdcMetricFetcherPtr& metric_fetcher) + : group_settings_(group_settings), metric_fetcher_(metric_fetcher), start_(true) { + thread_ = std::thread([this]() { + while (start_) { + rdc_policy_check_condition(); + usleep(500); // 500ms + } + }); +} + +RdcPolicyImpl::~RdcPolicyImpl() { + start_ = false; + thread_.join(); +} + +rdc_status_t RdcPolicyImpl::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) { + rdc_status_t status = RDC_ST_NOT_SUPPORTED; + std::lock_guard guard(policy_mutex_); + + // parameters check + if (policy.condition.type >= RDC_POLICY_COND_MAX) { + status = RDC_ST_BAD_PARAMETER; + return status; + } + + if (policy.action > RDC_POLICY_ACTION_GPU_RESET) { + status = RDC_ST_BAD_PARAMETER; + return status; + } + + // check if support RDC_POLICY_COND_MAX_PAGE_RETRIED + if (RDC_POLICY_COND_MAX_PAGE_RETRIED == policy.condition.type) { + uint32_t gpu_index; + rdc_group_info_t group_info; + rdc_field_value value; + + status = group_settings_->rdc_group_gpu_get_info(group_id, &group_info); + for (unsigned int i = 0; i < group_info.count; i++) { + gpu_index = group_info.entity_ids[i]; + + status = metric_fetcher_->fetch_smi_field(gpu_index, RDC_FI_GPU_PAGE_RETRIED, &value); + if (status == RDC_ST_MSI_ERROR) + return RDC_ST_NOT_SUPPORTED; + } + } + + auto it = settings_.find(group_id); + if (it != 
settings_.end()) { + std::vector& policies = it->second; + + bool exist = false; + for (auto& itpolicy : policies) { + // if exist, overwrite the value and action + if (itpolicy.condition.type == policy.condition.type) { + itpolicy.condition.value = policy.condition.value; + itpolicy.action = policy.action; + exist = true; + } + } + if (!exist) { + policies.push_back(policy); + } + + status = RDC_ST_OK; + } else { + std::vector policies; + policies.push_back(policy); + + settings_.insert(std::make_pair(group_id, policies)); + status = RDC_ST_OK; + } + + return status; +} + +rdc_status_t RdcPolicyImpl::rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) { + rdc_status_t status = RDC_ST_NOT_SUPPORTED; + + std::lock_guard guard(policy_mutex_); + + auto it = settings_.find(group_id); + if (it != settings_.end()) { + std::vector& policies_ref = it->second; + uint32_t i = 0; + for (auto itpolicy : policies_ref) { + // if exist + policies[i].condition.type = itpolicy.condition.type; + policies[i].condition.value = itpolicy.condition.value; + policies[i].action = itpolicy.action; + ++i; + } + *count = i; + + status = RDC_ST_OK; + } else { + status = RDC_ST_NOT_FOUND; + } + + return status; +} + +rdc_status_t RdcPolicyImpl::rdc_policy_delete(rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type) { + rdc_status_t status = RDC_ST_NOT_FOUND; + + std::lock_guard guard(policy_mutex_); + + auto it = settings_.find(group_id); + if (it != settings_.end()) { + std::vector& policies_ref = it->second; + + auto itpolicy = policies_ref.begin(); + while (itpolicy != policies_ref.end()) { + if (itpolicy->condition.type == condition_type) { + status = RDC_ST_OK; + itpolicy = policies_ref.erase(itpolicy); + } else { + ++itpolicy; + } + } + } else { + status = RDC_ST_NOT_FOUND; + } + + return status; +} + +rdc_status_t RdcPolicyImpl::rdc_policy_register(rdc_gpu_group_t group_id, + rdc_policy_register_callback callback) 
+ +{ + rdc_status_t status = RDC_ST_NOT_FOUND; + + std::lock_guard guard(policy_mutex_); + + auto run = callbacks_.find(group_id); + if (run != callbacks_.end()) { + run->second = callback; + status = RDC_ST_OK; + } else { + callbacks_.insert(std::make_pair(group_id, callback)); + status = RDC_ST_OK; + } + return status; +} + +rdc_status_t RdcPolicyImpl::rdc_policy_unregister(rdc_gpu_group_t group_id) { + rdc_status_t status = RDC_ST_OK; + + std::lock_guard guard(policy_mutex_); + + callbacks_.erase(group_id); + return status; +} + +void RdcPolicyImpl::rdc_policy_check_condition() { + // go through the settings + for (auto it : settings_) { + rdc_gpu_group_t group_id = it.first; + std::vector policies = it.second; + rdc_policy_register_callback callback = rdc_policy_get_callback(group_id); + + for (auto policy : policies) { + rdc_status_t status; + rdc_field_value value; + rdc_group_info_t group_info; + uint32_t gpu_index; + + status = group_settings_->rdc_group_gpu_get_info(group_id, &group_info); + for (unsigned int i = 0; i < group_info.count; i++) { + rdc_field_t map[RDC_POLICY_COND_MAX] = {RDC_FI_GPU_PAGE_RETRIED, RDC_FI_GPU_TEMP, + RDC_FI_POWER_USAGE}; + + gpu_index = group_info.entity_ids[i]; + status = metric_fetcher_->fetch_smi_field(gpu_index, map[policy.condition.type], &value); + if (status == RDC_ST_OK) { + if (value.value.l_int > policy.condition.value) { + if (RDC_POLICY_ACTION_GPU_RESET == policy.action) { + rdc_policy_gpu_reset(gpu_index); + } + + // callback if needed + if (callback) { + rdc_policy_callback_response_t response = {1, policy.condition, group_id, + value.value.l_int}; + callback(&response); + } + } + } + } + } + } +} + +rdc_policy_register_callback RdcPolicyImpl::rdc_policy_get_callback(rdc_gpu_group_t group_id) { + rdc_policy_register_callback cb = nullptr; + auto it = callbacks_.find(group_id); + if (it != callbacks_.end()) { + cb = it->second; + } + return cb; +} + +void RdcPolicyImpl::rdc_policy_gpu_reset(uint32_t gpu_index) { + 
amdsmi_processor_handle processor_handle = {}; + + amdsmi_status_t err = get_processor_handle_from_id(gpu_index, &processor_handle); + if (err == AMDSMI_STATUS_SUCCESS) { + amdsmi_reset_gpu(processor_handle); + } +} + +} // namespace rdc +} // namespace amd diff --git a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 1664ebe4ec..3f523d5bf6 100644 --- a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -23,6 +23,8 @@ THE SOFTWARE. #include +#include + #include "rdc.grpc.pb.h" // NOLINT amd::rdc::RdcHandler* make_handler(const char* ip_and_port, const char* root_ca, @@ -243,8 +245,8 @@ rdc_status_t RdcStandaloneHandler::rdc_device_get_attributes(uint32_t gpu_index, return RDC_ST_OK; } -rdc_status_t RdcStandaloneHandler::rdc_device_get_component_version(rdc_component_t component, rdc_component_version_t* p_rdc_compv) { - +rdc_status_t RdcStandaloneHandler::rdc_device_get_component_version( + rdc_component_t component, rdc_component_version_t* p_rdc_compv) { if (!p_rdc_compv) { return RDC_ST_BAD_PARAMETER; } @@ -663,8 +665,8 @@ rdc_status_t RdcStandaloneHandler::rdc_field_update_all(uint32_t wait_for_update } // It is only an interface for the client under the GRPC framework and is not used as an RDC API. 
-rdc_status_t RdcStandaloneHandler::get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) { - +rdc_status_t RdcStandaloneHandler::get_mixed_component_version( + mixed_component_t component, mixed_component_version_t* p_mixed_compv) { if (!p_mixed_compv) { return RDC_ST_BAD_PARAMETER; } @@ -681,7 +683,153 @@ rdc_status_t RdcStandaloneHandler::get_mixed_component_version(mixed_component_t strncpy_with_null(p_mixed_compv->version, reply.version().c_str(), USR_MAX_VERSION_STR_LENGTH); return RDC_ST_OK; +} +// Policy RdcAPI +rdc_status_t RdcStandaloneHandler::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) { + ::rdc::SetPolicyRequest request; + ::rdc::SetPolicyResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + auto to_policy = request.mutable_policy(); + to_policy->set_action(static_cast<::rdc::Policy_Action>(policy.action)); + + auto to_condition = to_policy->mutable_condition(); + + to_condition->set_type(static_cast<::rdc::PolicyCondition_Type>(policy.condition.type)); + to_condition->set_value(policy.condition.value); + + // call gRPC + ::grpc::Status status = stub_->SetPolicy(&context, request, &reply); + + return error_handle(status, reply.status()); +} + +rdc_status_t RdcStandaloneHandler::rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) { + ::rdc::GetPolicyRequest request; + ::rdc::GetPolicyResponse reply; + ::grpc::ClientContext context; + + if (count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + request.set_group_id(group_id); + + // call gRPC + ::grpc::Status status = stub_->GetPolicy(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + + auto response = reply.response(); + uint32_t policy_count = response.count(); + + for (uint32_t i = 0; i < policy_count; ++i) { + const ::rdc::Policy& policy = 
response.policies(i); + + ::rdc::PolicyCondition cond = policy.condition(); + policies[i].condition.type = static_cast(cond.type()); + policies[i].condition.value = cond.value(); + policies[i].action = static_cast(policy.action()); + } + + *count = policy_count; + + return RDC_ST_OK; +} + +rdc_status_t RdcStandaloneHandler::rdc_policy_delete(rdc_gpu_group_t group_id, + rdc_policy_condition_type_t condition_type) { + ::rdc::DeletePolicyRequest request; + ::rdc::DeletePolicyResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + + request.set_condition_type( + static_cast<::rdc::DeletePolicyRequest_PolicyConditionType>(condition_type)); + + // call gRPC + ::grpc::Status status = stub_->DeletePolicy(&context, request, &reply); + + return error_handle(status, reply.status()); +} + +rdc_status_t RdcStandaloneHandler::rdc_policy_register(rdc_gpu_group_t group_id, + rdc_policy_register_callback callback) { + // check if a thread for a group is already registered + auto it = policy_threads_.find(group_id); + if (it != policy_threads_.end()) { + return RDC_ST_CONFLICT; + } + + // no registered callback, start the thread to read the stream from rdcd + struct policy_thread_context ctx = {true,nullptr}; + + ctx.t = new std::thread([this, group_id, callback]() { + // call rdcd + ::rdc::RegisterPolicyRequest request; + ::rdc::RegisterPolicyResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + + // call to gRPC + std::unique_ptr> reader( + stub_->RegisterPolicy(&context, request)); + + bool start = true; + while (start) { + auto it = policy_threads_.find(group_id); + if (it != policy_threads_.end()) { + if (it->second.start == false) start = false; + } else { + start = false; + } + + if (reader->Read(&reply)) { + reply.status(); + ::rdc::PolicyCondition cond = reply.condition(); + + rdc_policy_callback_response_t response; + response.version = reply.version(); + response.condition.type = static_cast(cond.type()); + 
response.condition.value = cond.value(); + response.group_id = reply.group_id(); + response.value = reply.value(); + + callback(&response); + } + } + + reader->Finish(); + }); + + policy_threads_.insert(std::make_pair(group_id, ctx)); + + return RDC_ST_OK; +} + +rdc_status_t RdcStandaloneHandler::rdc_policy_unregister(rdc_gpu_group_t group_id) { + ::rdc::UnRegisterPolicyRequest request; + ::rdc::UnRegisterPolicyResponse reply; + ::grpc::ClientContext context; + + // stop the assocaticted thread of a group + auto it = policy_threads_.find(group_id); + if (it != policy_threads_.end()) { + struct policy_thread_context& ctx = it->second; + ctx.start = false; + } + + // construcut the request + request.set_group_id(group_id); + + // call gRPC + ::grpc::Status status = stub_->UnRegisterPolicy(&context, request, &reply); + return error_handle(status, reply.status()); } } // namespace rdc diff --git a/server/include/rdc/rdc_api_service.h b/server/include/rdc/rdc_api_service.h index a91ee90661..ef38f66cca 100644 --- a/server/include/rdc/rdc_api_service.h +++ b/server/include/rdc/rdc_api_service.h @@ -22,6 +22,8 @@ THE SOFTWARE. 
#ifndef SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_ #define SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_ +#include + #include "rdc.grpc.pb.h" // NOLINT #include "rdc/rdc.h" @@ -34,6 +36,7 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service { ~RdcAPIServiceImpl(); rdc_status_t Initialize(uint64_t rdcd_init_flags = 0); + void Shutdown(); ::grpc::Status GetAllDevices(::grpc::ServerContext* context, const ::rdc::Empty* request, ::rdc::GetAllDevicesResponse* reply) override; @@ -42,8 +45,9 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service { const ::rdc::GetDeviceAttributesRequest* request, ::rdc::GetDeviceAttributesResponse* reply) override; - ::grpc::Status GetComponentVersion(::grpc::ServerContext* context, const ::rdc::GetComponentVersionRequest* request, - ::rdc::GetComponentVersionResponse* reply) override; + ::grpc::Status GetComponentVersion(::grpc::ServerContext* context, + const ::rdc::GetComponentVersionRequest* request, + ::rdc::GetComponentVersionResponse* reply) override; ::grpc::Status CreateGpuGroup(::grpc::ServerContext* context, const ::rdc::CreateGpuGroupRequest* request, @@ -125,11 +129,41 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service { const ::rdc::DiagnosticTestCaseRunRequest* request, ::rdc::DiagnosticTestCaseRunResponse* reply) override; - ::grpc::Status GetMixedComponentVersion(::grpc::ServerContext* context, const ::rdc::GetMixedComponentVersionRequest* request, - ::rdc::GetMixedComponentVersionResponse* reply) override; + ::grpc::Status GetMixedComponentVersion(::grpc::ServerContext* context, + const ::rdc::GetMixedComponentVersionRequest* request, + ::rdc::GetMixedComponentVersionResponse* reply) override; + + ::grpc::Status SetPolicy(::grpc::ServerContext* context, const ::rdc::SetPolicyRequest* request, + ::rdc::SetPolicyResponse* reply) override; + + ::grpc::Status GetPolicy(::grpc::ServerContext* context, const ::rdc::GetPolicyRequest* request, + ::rdc::GetPolicyResponse* reply) override; + + 
::grpc::Status DeletePolicy(::grpc::ServerContext* context, + const ::rdc::DeletePolicyRequest* request, + ::rdc::DeletePolicyResponse* reply) override; + + ::grpc::Status RegisterPolicy( + ::grpc::ServerContext* context, const ::rdc::RegisterPolicyRequest* request, + ::grpc::ServerWriter< ::rdc::RegisterPolicyResponse>* stream) override; + + ::grpc::Status UnRegisterPolicy(::grpc::ServerContext* context, + const ::rdc::UnRegisterPolicyRequest* request, + ::rdc::UnRegisterPolicyResponse* reply) override; + private: bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target); rdc_handle_t rdc_handle_; + + struct policy_thread_context { + pthread_mutex_t mutex; + pthread_cond_t cond; + rdc_policy_callback_response_t response; + bool start; + }; + // map for group_id and thread context + static std::map policy_threads_; + static int PolicyCallback(rdc_policy_callback_response_t* userData); }; } // namespace rdc diff --git a/server/src/rdc_api_service.cc b/server/src/rdc_api_service.cc index 66c86d57da..a88acbe290 100644 --- a/server/src/rdc_api_service.cc +++ b/server/src/rdc_api_service.cc @@ -20,11 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "rdc/rdc_api_service.h" -#include "rdc/rdc_server_main.h" + #include #include #include +#include #include #include #include @@ -32,12 +33,15 @@ THE SOFTWARE. 
#include "rdc.grpc.pb.h" // NOLINT #include "rdc/rdc.h" #include "rdc/rdc_private.h" +#include "rdc/rdc_server_main.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { +std::map RdcAPIServiceImpl::policy_threads_; + RdcAPIServiceImpl::RdcAPIServiceImpl() : rdc_handle_(nullptr) {} rdc_status_t RdcAPIServiceImpl::Initialize(uint64_t rdcd_init_flags) { @@ -55,6 +59,14 @@ rdc_status_t RdcAPIServiceImpl::Initialize(uint64_t rdcd_init_flags) { return result; } +void RdcAPIServiceImpl::Shutdown() { + // exit policy threads + for (auto it : policy_threads_) { + policy_thread_context* ctx = it.second; + ctx->start = false; + } +} + RdcAPIServiceImpl::~RdcAPIServiceImpl() { if (rdc_handle_) { rdc_stop_embedded(rdc_handle_); @@ -104,9 +116,9 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() { return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::GetComponentVersion(::grpc::ServerContext* context, - const ::rdc::GetComponentVersionRequest* request, - ::rdc::GetComponentVersionResponse* reply) { +::grpc::Status RdcAPIServiceImpl::GetComponentVersion( + ::grpc::ServerContext* context, const ::rdc::GetComponentVersionRequest* request, + ::rdc::GetComponentVersionResponse* reply) { (void)(context); if (!reply) { return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty reply"); @@ -661,9 +673,9 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::GetMixedComponentVersion(::grpc::ServerContext* context, - const ::rdc::GetMixedComponentVersionRequest* request, - ::rdc::GetMixedComponentVersionResponse* reply) { +::grpc::Status RdcAPIServiceImpl::GetMixedComponentVersion( + ::grpc::ServerContext* context, const ::rdc::GetMixedComponentVersionRequest* request, + ::rdc::GetMixedComponentVersionResponse* reply) { (void)(context); if (!reply) { return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty reply"); @@ -687,9 +699,185 @@ bool 
RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, reply->set_status(RDC_ST_OK); return ::grpc::Status::OK; } else { - return ::grpc::Status(::grpc::StatusCode::INVALID_ARGUMENT, "The provided request parameters are invalid"); + return ::grpc::Status(::grpc::StatusCode::INVALID_ARGUMENT, + "The provided request parameters are invalid"); } } +::grpc::Status RdcAPIServiceImpl::SetPolicy(::grpc::ServerContext* context, + const ::rdc::SetPolicyRequest* request, + ::rdc::SetPolicyResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + + rdc_policy_t policy; + // constructure the policy + ::rdc::Policy p = request->policy(); + + ::rdc::PolicyCondition cond = p.condition(); + policy.condition.type = static_cast(cond.type()); + policy.condition.value = cond.value(); + policy.action = static_cast(p.action()); + + // call RDC Policy API + rdc_status_t result = rdc_policy_set(rdc_handle_, request->group_id(), policy); + + // set status + reply->set_status(result); + + return ::grpc::Status::OK; +} + +::grpc::Status RdcAPIServiceImpl::GetPolicy(::grpc::ServerContext* context, + const ::rdc::GetPolicyRequest* request, + ::rdc::GetPolicyResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + + uint32_t count = 0; + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]; + + // call RDC Policy API + rdc_status_t result = rdc_policy_get(rdc_handle_, request->group_id(), &count, policies); + + // set status + reply->set_status(result); + if (result != RDC_ST_OK) { + return ::grpc::Status::OK; + } + + ::rdc::PolicyResponse* to_response = reply->mutable_response(); + to_response->set_count(count); + for (uint32_t i = 0; i < count; i++) { + const rdc_policy_t& policy_ref = policies[i]; + ::rdc::Policy* policy = to_response->add_policies(); + 
policy->set_action(static_cast<::rdc::Policy_Action>(policy_ref.action)); + + auto to_conditon = policy->mutable_condition(); + + to_conditon->set_type(static_cast<::rdc::PolicyCondition_Type>(policy_ref.condition.type)); + to_conditon->set_value(policy_ref.condition.value); + } + + return ::grpc::Status::OK; +} + +::grpc::Status RdcAPIServiceImpl::DeletePolicy(::grpc::ServerContext* context, + const ::rdc::DeletePolicyRequest* request, + ::rdc::DeletePolicyResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + + // call RDC Policy API + rdc_status_t result = + rdc_policy_delete(rdc_handle_, request->group_id(), + static_cast(request->condition_type())); + + // set status + reply->set_status(result); + + return ::grpc::Status::OK; +} + +int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData) { + + if (userData == nullptr) { + std::cerr << "The rdc_policy_callback returns null data\n"; + return 1; + } + + auto it = policy_threads_.find(userData->group_id); + if (it != policy_threads_.end()) { + policy_thread_context* ctx = it->second; + + pthread_mutex_lock(&ctx->mutex); + ctx->response = *userData; + pthread_cond_signal(&ctx->cond); + pthread_mutex_unlock(&ctx->mutex); + } + + return 0; +} + +::grpc::Status RdcAPIServiceImpl::RegisterPolicy( + ::grpc::ServerContext* context, const ::rdc::RegisterPolicyRequest* request, + ::grpc::ServerWriter<::rdc::RegisterPolicyResponse>* writer) { + (void)(context); + if (!writer || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + + policy_thread_context* data = new policy_thread_context; + data->mutex = PTHREAD_MUTEX_INITIALIZER; + data->cond = PTHREAD_COND_INITIALIZER; + data->start = true; + policy_threads_.insert(std::make_pair(request->group_id(), data)); + + auto updater = std::async(std::launch::async, [this, request, writer]() { + rdc_status_t result = 
rdc_policy_register(rdc_handle_, request->group_id(), PolicyCallback); + if (result == RDC_ST_OK) { + auto it = policy_threads_.find(request->group_id()); + if (it != policy_threads_.end()) { + policy_thread_context* ctx = it->second; + while (ctx->start) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + + ts.tv_sec += 1; + + pthread_mutex_lock(&ctx->mutex); + int rc = pthread_cond_timedwait(&ctx->cond, &ctx->mutex, &ts); // wait for the callback + if (rc == ETIMEDOUT) { + // timeout; + } else if (rc == 0) { + // reply + ::rdc::RegisterPolicyResponse reply; + reply.set_status(RDC_ST_OK); + reply.set_version(ctx->response.version); + reply.set_group_id(ctx->response.group_id); + reply.set_value(ctx->response.value); + + ::rdc::PolicyCondition* cond = reply.mutable_condition(); + cond->set_type(static_cast<::rdc::PolicyCondition_Type>(ctx->response.condition.type)); + cond->set_value(ctx->response.condition.value); + + writer->Write(reply); + } + + pthread_mutex_unlock(&ctx->mutex); + } + } + + } + }); + + return ::grpc::Status::OK; +} + +::grpc::Status RdcAPIServiceImpl::UnRegisterPolicy(::grpc::ServerContext* context, + const ::rdc::UnRegisterPolicyRequest* request, + ::rdc::UnRegisterPolicyResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + + rdc_status_t result = rdc_policy_unregister(rdc_handle_, request->group_id()); + if (result == RDC_ST_OK) { + auto it = policy_threads_.find(request->group_id()); + if (it != policy_threads_.end()) { + policy_thread_context* ctx = it->second; + ctx->start = false; + } + } + return ::grpc::Status::OK; +} + } // namespace rdc } // namespace amd diff --git a/server/src/rdc_server_main.cc b/server/src/rdc_server_main.cc index e72fb47758..766aaf9b30 100644 --- a/server/src/rdc_server_main.cc +++ b/server/src/rdc_server_main.cc @@ -271,6 +271,8 @@ static int FileOwner(const char* fn, std::string* owner) { } void 
RDCServer::ShutDown(void) { + api_service_->Shutdown(); + server_->Shutdown(); if (rdc_admin_service_) {