Enable RDC policy feature
1. Add policy APIs
2. Add policy example for policy API usage
Change-Id: I14deb7c809d0b865b7bb083842092fc37868025e
Signed-off-by: Chao Fei <Chao.Fei@amd.com>
[ROCm/rdc commit: 345ac64a43]
Этот коммит содержится в:
@@ -53,6 +53,8 @@ FLD_DESC_ENT(RDC_FI_GPU_MM_ENC_UTIL, "Mutilmedia encoder busy percentage",
|
||||
FLD_DESC_ENT(RDC_FI_GPU_MM_DEC_UTIL, "Mutilmedia decoder busy percentage", "GPU_MM_DEC_UTIL", true)
|
||||
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_ACTIVITY, "Memory busy percentage", "GPU_MEM_UTIL", true)
|
||||
|
||||
FLD_DESC_ENT(RDC_FI_GPU_PAGE_RETRIED, "Retried page of the GPU instance", "GPU_PAGE_RETRIED", true)
|
||||
|
||||
// ECC totals
|
||||
FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated Single Error Correction", "ECC_CORRECT", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated Double Error Detection", "ECC_UNCORRECT", true)
|
||||
|
||||
@@ -113,6 +113,13 @@ set(ROCPROFILER_EXAMPLE_EXE "rocprofiler")
|
||||
add_executable(${ROCPROFILER_EXAMPLE_EXE} "${ROCPROFILER_EXAMPLE_SRC_LIST}")
|
||||
target_link_libraries(${ROCPROFILER_EXAMPLE_EXE} pthread dl rdc_bootstrap)
|
||||
|
||||
|
||||
# Policy example: builds the "policy" executable from policy_example.cc and
# links it against pthread, dl and rdc_bootstrap.
set(POLICY_EXAMPLE_SRC_LIST "policy_example.cc")
|
||||
cmake_print_variables(POLICY_EXAMPLE_SRC_LIST)
|
||||
set(POLICY_EXAMPLE_EXE "policy")
|
||||
add_executable(${POLICY_EXAMPLE_EXE} "${POLICY_EXAMPLE_SRC_LIST}")
|
||||
target_link_libraries(${POLICY_EXAMPLE_EXE} pthread dl rdc_bootstrap)
|
||||
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
message(" Finished Cmake Example ")
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
|
||||
@@ -0,0 +1,195 @@
|
||||
#include <unistd.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
|
||||
// Translates a policy condition type into the label printed by this example.
// Any value without a dedicated label yields "Unknown_Type".
static const char* condition_type_to_str(rdc_policy_condition_type_t type) {
  switch (type) {
    case RDC_POLICY_COND_MAX_PAGE_RETRIED:
      return "Retried Page Limit";
    case RDC_POLICY_COND_THERMAL:
      return "Temperature Limit";
    case RDC_POLICY_COND_POWER:
      return "Power Limit";
    default:
      return "Unknown_Type";
  }
}
|
||||
|
||||
static time_t last_time = 0; // last time to print message
|
||||
// Callback handed to rdc_policy_register(): reports a met policy condition on
// stdout. Returns 1 when called without data, 0 otherwise.
int rdc_policy_callback(rdc_policy_callback_response_t* userData) {
  if (userData != nullptr) {
    // Rate limit: stay silent unless at least 5 seconds have elapsed since the
    // last message that was actually printed.
    const time_t current = time(NULL);
    const bool throttled = difftime(current, last_time) < 5;
    if (!throttled) {
      const char* label = condition_type_to_str(userData->condition.type);
      std::cout << "The " << label << " exceeds the threshold " << userData->condition.value
                << " with the value " << userData->value << std::endl;
      last_time = current;  // remember when we last printed
    }
    return 0;
  }

  std::cerr << "The rdc_policy_callback returns null data\n";
  return 1;
}
|
||||
|
||||
int main() {
|
||||
rdc_gpu_group_t group_id;
|
||||
rdc_status_t result;
|
||||
bool standalone = false;
|
||||
rdc_handle_t rdc_handle;
|
||||
uint32_t count = 0;
|
||||
|
||||
char hostIpAddress[] = {"localhost:50051"};
|
||||
char group_name[] = {"group1"};
|
||||
|
||||
// Select the embedded mode and standalone mode dynamically.
|
||||
std::cout << "Start rdci in: \n";
|
||||
std::cout << "0 - Embedded mode \n";
|
||||
std::cout << "1 - Standalone mode \n";
|
||||
while (!(std::cin >> standalone)) {
|
||||
std::cout << "Invalid input.\n";
|
||||
std::cin.clear();
|
||||
std::cin.ignore();
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n");
|
||||
|
||||
// Init the rdc
|
||||
result = rdc_init(0);
|
||||
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
|
||||
goto cleanup;
|
||||
} else {
|
||||
std::cout << "RDC Initialized.\n";
|
||||
}
|
||||
|
||||
if (standalone) { // standalone
|
||||
result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
} else { // embedded
|
||||
result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
// Now we can use the same API for both standalone and embedded
|
||||
// Get the list of devices in the system
|
||||
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
|
||||
result = rdc_device_get_all(rdc_handle, gpu_index_list, &count);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error to find devices on the system. Return: " << rdc_status_string(result);
|
||||
goto cleanup;
|
||||
}
|
||||
if (count == 0) {
|
||||
std::cout << "No GPUs find on the sytem ";
|
||||
goto cleanup;
|
||||
} else {
|
||||
std::cout << count << " GPUs found in the system.\n";
|
||||
}
|
||||
|
||||
// Create the group
|
||||
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error creating group. Return: " << rdc_status_string(result);
|
||||
goto cleanup;
|
||||
}
|
||||
std::cout << "Created the GPU group " << group_id << std::endl;
|
||||
|
||||
// Add all GPUs to the group
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]); // Add GPU 0
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error adding group. Return: " << rdc_status_string(result);
|
||||
goto cleanup;
|
||||
}
|
||||
rdc_device_attributes_t attribute;
|
||||
result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result);
|
||||
goto cleanup;
|
||||
}
|
||||
std::cout << "Add GPU " << gpu_index_list[i] << ":" << attribute.device_name << " to group "
|
||||
<< group_id << std::endl;
|
||||
}
|
||||
|
||||
// Define a policy to print out message when temperature is above 30 degree
|
||||
// or power usage is more than 150W
|
||||
rdc_policy_t policy;
|
||||
policy.condition = {RDC_POLICY_COND_THERMAL, 30 * 1000}; // convert to milli degree
|
||||
policy.action = RDC_POLICY_ACTION_NONE; // Notify only
|
||||
result = rdc_policy_set(rdc_handle, group_id, policy);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error set policy RDC_POLICY_COND_THERMAL, Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
policy.condition = {RDC_POLICY_COND_POWER, 150000}; // convert to milli degree
|
||||
policy.action = RDC_POLICY_ACTION_NONE; // Notify only
|
||||
result = rdc_policy_set(rdc_handle, group_id, policy);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error set policy RDC_POLICY_COND_POWER, Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
policy.condition = {RDC_POLICY_COND_MAX_PAGE_RETRIED, 100}; // convert to milli degree
|
||||
policy.action = RDC_POLICY_ACTION_NONE; // Notify only
|
||||
result = rdc_policy_set(rdc_handle, group_id, policy);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error set policy RDC_POLICY_COND_MAX_PAGE_RETRIED, Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
rdc_policy_t policy_get[RDC_MAX_POLICY_SETTINGS];
|
||||
result = rdc_policy_get(rdc_handle, group_id, &count, policy_get);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error get policy, Return: " << rdc_status_string(result) << std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Register a function to listen to the events
|
||||
result = rdc_policy_register(rdc_handle, group_id, rdc_policy_callback);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error register policy, Return: " << rdc_status_string(result) << std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
std::cout << "Wait 30 seconds for the events happening ...\n" << std::endl;
|
||||
|
||||
// If the events happening, the callback rdc_policy_register_callback will be called.
|
||||
usleep(30 * 1000000); // sleep 30 seconds
|
||||
|
||||
// Un-register the events
|
||||
result = rdc_policy_unregister(rdc_handle, group_id);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error unregister policy, Return: " << rdc_status_string(result) << std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// clear the events
|
||||
rdc_policy_condition_type_t condition_type;
|
||||
condition_type = RDC_POLICY_COND_THERMAL;
|
||||
result = rdc_policy_delete(rdc_handle, group_id, condition_type);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error clear policy, Return: " << rdc_status_string(result) << std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
//... clean up
|
||||
cleanup:
|
||||
std::cout << "Cleaning up.\n";
|
||||
if (standalone)
|
||||
rdc_disconnect(rdc_handle);
|
||||
else
|
||||
rdc_stop_embedded(rdc_handle);
|
||||
rdc_shutdown();
|
||||
return result;
|
||||
}
|
||||
@@ -189,6 +189,10 @@ typedef enum {
|
||||
RDC_FI_GPU_MM_DEC_UTIL, //!< Multimedia decoder busy percentage
|
||||
RDC_FI_GPU_MEMORY_ACTIVITY, //!< Memory busy percentage
|
||||
|
||||
/**
|
||||
* @brief GPU page related fields
|
||||
*/
|
||||
RDC_FI_GPU_PAGE_RETRIED = 550, //!< Retried page of the GPU instance
|
||||
/**
|
||||
* @brief ECC related fields
|
||||
*/
|
||||
@@ -552,6 +556,31 @@ typedef struct {
|
||||
rdc_diag_test_result_t diag_info[MAX_TEST_CASES];
|
||||
} rdc_diag_response_t;
|
||||
|
||||
/**
|
||||
* @brief The policy type to support
|
||||
*/
|
||||
typedef enum {
|
||||
RDC_POLICY_COND_MAX_PAGE_RETRIED, //!< Max number of page retired
|
||||
RDC_POLICY_COND_THERMAL, //!< Temperature threshold, millidegree Celsius
|
||||
RDC_POLICY_COND_POWER, //!< Power threshold, unit milliwatt
|
||||
RDC_POLICY_COND_MAX //!< NOTE(review): presumably an end marker (count of condition types), not a settable condition - confirm
|
||||
} rdc_policy_condition_type_t;
|
||||
|
||||
/**
 * @brief A policy condition: the kind of limit being watched plus its threshold.
 */
typedef struct {
|
||||
rdc_policy_condition_type_t type; //!< Which limit this condition watches
|
||||
int64_t value; //!< Threshold, in the unit documented for the condition type
|
||||
} rdc_policy_condition_t;
|
||||
|
||||
/**
 * @brief The action attached to a policy.
 *
 * RDC_POLICY_ACTION_NONE is used by the policy example as "notify only".
 * NOTE(review): RDC_POLICY_ACTION_GPU_RESET presumably resets the GPU when the
 * condition is met - confirm against the implementation.
 */
typedef enum { RDC_POLICY_ACTION_NONE, RDC_POLICY_ACTION_GPU_RESET } rdc_policy_action_t;
|
||||
|
||||
/**
|
||||
* @brief The structure to define policy to enforce on GPU.
|
||||
*/
|
||||
typedef struct {
|
||||
rdc_policy_condition_t condition; //!< condition to meet
|
||||
rdc_policy_action_t action; //!< Action to take
|
||||
} rdc_policy_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize ROCm RDC.
|
||||
*
|
||||
@@ -1131,6 +1160,107 @@ rdc_field_t get_field_id_from_name(const char* name);
|
||||
*/
|
||||
const char* rdc_diagnostic_result_string(rdc_diag_result_t result);
|
||||
|
||||
/**
|
||||
* @brief Set the RDC policy. Each group has multiple policies, these policies can be set by this
|
||||
* API one by one. Multiple calls of this API will override the existing policy.
|
||||
*
|
||||
* @details Set the RDC policy
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @param[in] policy The policy to set
|
||||
*
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_policy_t policy);
|
||||
|
||||
#define RDC_MAX_POLICY_SETTINGS 32
|
||||
|
||||
/**
|
||||
* @brief Get the RDC policy
|
||||
*
|
||||
* @details Get the RDC policy
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @param[out] count The size of policies array
|
||||
*
|
||||
* @param[out] policies The policies to get
|
||||
*
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t* count,
|
||||
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]);
|
||||
|
||||
/**
|
||||
* @brief delete the RDC policy for this group based on condition type
|
||||
*
|
||||
* @details clear the RDC policy for this group based on condition type. In a GPU group, only one
|
||||
* policy can be set for a specific rdc_policy_condition_type_t
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id
|
||||
*
|
||||
* @param[in] condition_type The condition type to delete
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
|
||||
rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_policy_condition_type_t condition_type);
|
||||
|
||||
/**
|
||||
* Defines the structure passed to the RDC policy callback.
|
||||
*/
|
||||
typedef struct {
|
||||
unsigned int version; //!< NOTE(review): presumably the version of this structure's layout - confirm
|
||||
rdc_policy_condition_t condition; //!< The condition that was met
|
||||
rdc_gpu_group_t group_id; //!< The group id that triggered this callback
|
||||
int64_t value; //!< The current value that met the condition
|
||||
} rdc_policy_callback_response_t;
|
||||
|
||||
/**
|
||||
* The user data is the rdc_policy_callback_response_t
|
||||
*/
|
||||
typedef int (*rdc_policy_register_callback)(rdc_policy_callback_response_t* userData);
|
||||
|
||||
/**
|
||||
* @brief Register a function to be called when a policy condition is met.
|
||||
*
|
||||
* @details Register the RDC policy callback
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @param[in] callback The callback function to be called when a condition is met.
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_policy_register_callback callback);
|
||||
|
||||
/**
|
||||
* @brief Un-register the policy callback function of a GPU group.
|
||||
*
|
||||
* @details Un-register the policy callback for a condition.
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
@@ -92,6 +92,19 @@ class RdcHandler {
|
||||
// It is just a client interface under the GRPC framework and is not used as an RDC API.
|
||||
// The reason is that RdcEmbeddedHandler::get_mixed_component_version does not need to be called.
|
||||
virtual rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) = 0;
|
||||
// Policy API
|
||||
virtual rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
|
||||
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
|
||||
rdc_policy_condition_type_t condition_type) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,
|
||||
rdc_policy_register_callback callback) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0;
|
||||
|
||||
virtual ~RdcHandler() {}
|
||||
};
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_RDCPOLICY_H_
|
||||
#define INCLUDE_RDC_LIB_RDCPOLICY_H_
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcPolicy {
|
||||
public:
|
||||
virtual rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
|
||||
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
|
||||
rdc_policy_condition_type_t condition_type) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,
|
||||
rdc_policy_register_callback callback) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0;
|
||||
|
||||
virtual ~RdcPolicy() {}
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<RdcPolicy> RdcPolicyPtr;
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_RDCPOLICY_H_
|
||||
@@ -31,6 +31,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/RdcMetricsUpdater.h"
|
||||
#include "rdc_lib/RdcModuleMgr.h"
|
||||
#include "rdc_lib/RdcNotification.h"
|
||||
#include "rdc_lib/RdcPolicy.h"
|
||||
#include "rdc_lib/RdcWatchTable.h"
|
||||
|
||||
namespace amd {
|
||||
@@ -94,6 +95,18 @@ class RdcEmbeddedHandler final : public RdcHandler {
|
||||
// It is just a client interface under the GRPC framework and is not used as an RDC API.
|
||||
// Pure virtual functions need to be overridden.
|
||||
rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override;
|
||||
// Policy API
|
||||
rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override;
|
||||
|
||||
rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
|
||||
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override;
|
||||
|
||||
rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
|
||||
rdc_policy_condition_type_t condition_type) override;
|
||||
|
||||
rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id, rdc_policy_register_callback callback) override;
|
||||
|
||||
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
|
||||
|
||||
explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
|
||||
~RdcEmbeddedHandler() final;
|
||||
@@ -107,6 +120,7 @@ class RdcEmbeddedHandler final : public RdcHandler {
|
||||
RdcNotificationPtr rdc_notif_;
|
||||
RdcWatchTablePtr watch_table_;
|
||||
RdcMetricsUpdaterPtr metrics_updater_;
|
||||
RdcPolicyPtr policy_;
|
||||
std::future<void> updater_;
|
||||
};
|
||||
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_IMPL_RDCPOLICYIMPL_H_
|
||||
#define INCLUDE_RDC_LIB_IMPL_RDCPOLICYIMPL_H_
|
||||
|
||||
#include <atomic>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex> // NOLINT
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <future>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc_lib/RdcPolicy.h"
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
#include "rdc_lib/RdcGroupSettings.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcPolicyImpl : public RdcPolicy {
|
||||
public:
|
||||
RdcPolicyImpl(const RdcGroupSettingsPtr& group_settings, const RdcMetricFetcherPtr& metric_fetcher);
|
||||
~RdcPolicyImpl();
|
||||
|
||||
rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override;
|
||||
|
||||
rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
|
||||
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override;
|
||||
|
||||
rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
|
||||
rdc_policy_condition_type_t condition_type) override;
|
||||
|
||||
rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,rdc_policy_register_callback callback) override;
|
||||
|
||||
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
|
||||
|
||||
private:
|
||||
RdcGroupSettingsPtr group_settings_;
|
||||
RdcMetricFetcherPtr metric_fetcher_;
|
||||
std::mutex policy_mutex_;
|
||||
std::thread thread_;
|
||||
bool start_;
|
||||
|
||||
std::map<rdc_gpu_group_t, std::vector<rdc_policy_t> > settings_;
|
||||
std::map<rdc_gpu_group_t, rdc_policy_register_callback> callbacks_;
|
||||
|
||||
void rdc_policy_check_condition();
|
||||
void rdc_policy_gpu_reset(uint32_t gpu_index);
|
||||
rdc_policy_register_callback rdc_policy_get_callback(rdc_gpu_group_t group_id);
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_IMPL_RDCPOLICYIMPL_H_
|
||||
@@ -24,6 +24,8 @@ THE SOFTWARE.
|
||||
#include <grpcpp/grpcpp.h>
|
||||
|
||||
#include <memory>
|
||||
#include <future>
|
||||
#include <thread>
|
||||
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc_lib/RdcHandler.h"
|
||||
@@ -89,6 +91,18 @@ class RdcStandaloneHandler : public RdcHandler {
|
||||
// It is just a client interface under the GRPC framework and is not used as an RDC API.
|
||||
// Pure virtual functions need to be overridden
|
||||
rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override;
|
||||
// Policy API
|
||||
rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override;
|
||||
|
||||
rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override;
|
||||
|
||||
rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
|
||||
rdc_policy_condition_type_t condition_type) override;
|
||||
|
||||
rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,
|
||||
rdc_policy_register_callback callback) override;
|
||||
|
||||
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
|
||||
|
||||
explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca,
|
||||
const char* client_cert, const char* client_key);
|
||||
@@ -100,6 +114,15 @@ class RdcStandaloneHandler : public RdcHandler {
|
||||
bool copy_gpu_usage_info(const ::rdc::GpuUsageInfo& src, rdc_gpu_usage_info_t* target);
|
||||
|
||||
std::unique_ptr<::rdc::RdcAPI::Stub> stub_;
|
||||
// thread for policy callback
|
||||
|
||||
// Per-entry state kept in policy_threads_ for a policy callback thread.
struct policy_thread_context {
|
||||
bool start;  // NOTE(review): presumably the run flag checked by the thread - confirm in the .cc
|
||||
std::thread *t;  // NOTE(review): raw pointer; who creates, joins and deletes it is not visible here
|
||||
};
|
||||
|
||||
std::map<uint32_t, struct policy_thread_context> policy_threads_;
|
||||
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
|
||||
@@ -154,6 +154,40 @@ service RdcAPI {
|
||||
|
||||
//Just an RPC method not used as an API
|
||||
rpc GetMixedComponentVersion(GetMixedComponentVersionRequest) returns (GetMixedComponentVersionResponse) {}
|
||||
|
||||
// rdc_status_t rdc_policy_set(
|
||||
// rdc_handle_t p_rdc_handle,
|
||||
// rdc_gpu_group_t group_id,
|
||||
// rdc_policy_t policy);
|
||||
rpc SetPolicy(SetPolicyRequest) returns (SetPolicyResponse) {}
|
||||
|
||||
// rdc_status_t rdc_policy_get(
|
||||
// rdc_handle_t p_rdc_handle,
|
||||
// rdc_gpu_group_t group_id,
|
||||
// uint32_t* count,
|
||||
// rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]);
|
||||
rpc GetPolicy(GetPolicyRequest) returns (GetPolicyResponse) {}
|
||||
|
||||
// rdc_status_t rdc_policy_delete(
|
||||
// rdc_handle_t p_rdc_handle,
|
||||
// rdc_gpu_group_t group_id,
|
||||
// rdc_policy_condition_type_t condition_type);
|
||||
rpc DeletePolicy(DeletePolicyRequest) returns (DeletePolicyResponse) {}
|
||||
|
||||
|
||||
// rdc_status_t rdc_policy_register(
|
||||
// rdc_handle_t p_rdc_handle,
|
||||
// rdc_gpu_group_t group_id,
|
||||
// rdc_policy_register_callback callback);
|
||||
rpc RegisterPolicy(RegisterPolicyRequest) returns (stream RegisterPolicyResponse) {}
|
||||
|
||||
// rdc_status_t rdc_policy_unregister(
|
||||
// rdc_handle_t p_rdc_handle,
|
||||
// rdc_gpu_group_t group_id);
|
||||
rpc UnRegisterPolicy(UnRegisterPolicyRequest) returns (UnRegisterPolicyResponse) {}
|
||||
|
||||
}
|
||||
|
||||
message Empty {
|
||||
@@ -485,4 +519,100 @@ message GetMixedComponentVersionRequest {
|
||||
message GetMixedComponentVersionResponse {
|
||||
uint32 status = 1;
|
||||
string version = 2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
message PolicyCondition{
|
||||
enum Type {
|
||||
COND_MAX_PAGE_RETRIED = 0;
|
||||
COND_THERMAL = 1;
|
||||
COND_POWER = 2;
|
||||
};
|
||||
Type type = 1;
|
||||
int64 value = 2;
|
||||
}
|
||||
|
||||
|
||||
message Policy{
|
||||
PolicyCondition condition = 1;
|
||||
|
||||
enum Action {
|
||||
ACTION_NONE = 0;
|
||||
ACTION_GPU_RESET = 1;
|
||||
};
|
||||
Action action = 2;
|
||||
}
|
||||
|
||||
message SetPolicyResult {
|
||||
uint32 status = 1;
|
||||
}
|
||||
|
||||
message SetPolicyRequest {
|
||||
uint32 group_id = 1;
|
||||
Policy policy =2;
|
||||
}
|
||||
|
||||
message SetPolicyResponse {
|
||||
uint32 status = 1;
|
||||
}
|
||||
|
||||
|
||||
message PolicyResponse {
|
||||
uint32 count= 1;
|
||||
repeated Policy policies = 2;
|
||||
}
|
||||
|
||||
message GetPolicyRequest {
|
||||
uint32 group_id = 1;
|
||||
|
||||
}
|
||||
|
||||
message GetPolicyResponse {
|
||||
uint32 status = 1;
|
||||
PolicyResponse response = 2;
|
||||
}
|
||||
|
||||
// Request of the DeletePolicy RPC: the GPU group and the condition type whose
// policy should be deleted.
// NOTE(review): PolicyConditionType repeats the values of PolicyCondition.Type;
// the two enums must be kept in sync by hand.
message DeletePolicyRequest {
|
||||
uint32 group_id = 1;
|
||||
enum PolicyConditionType{
|
||||
RDC_POLICY_COND_MAX_PAGE_RETRIED = 0;
|
||||
RDC_POLICY_COND_THERMAL = 1;
|
||||
RDC_POLICY_COND_POWER = 2;
|
||||
};
|
||||
PolicyConditionType condition_type = 2;
|
||||
}
|
||||
|
||||
message DeletePolicyResponse {
|
||||
uint32 status = 1;
|
||||
}
|
||||
|
||||
|
||||
message RegisterPolicyResult {
|
||||
uint32 status = 1;
|
||||
}
|
||||
|
||||
message RegisterPolicyRequest {
|
||||
uint32 group_id = 1;
|
||||
|
||||
}
|
||||
|
||||
// Message streamed back by the RegisterPolicy RPC. Besides status, the fields
// have the same names as the members of rdc_policy_callback_response_t.
message RegisterPolicyResponse {
|
||||
uint32 status = 1;
|
||||
uint32 version =2;
|
||||
PolicyCondition condition =3;
|
||||
uint32 group_id =4;
|
||||
// NOTE(review): unsigned here, while PolicyCondition.value is int64 and the C
// struct member is int64_t - confirm that negative readings cannot occur.
uint64 value=5;
|
||||
}
|
||||
|
||||
message UnRegisterPolicyResult {
|
||||
uint32 status = 1;
|
||||
}
|
||||
|
||||
message UnRegisterPolicyRequest {
|
||||
uint32 group_id = 1;
|
||||
|
||||
}
|
||||
|
||||
message UnRegisterPolicyResponse {
|
||||
uint32 status = 1;
|
||||
}
|
||||
|
||||
@@ -409,3 +409,49 @@ char* strncpy_with_null(char* dest, const char* src, size_t n) {
|
||||
dest[n - 1] = '\0';
|
||||
return dest;
|
||||
}
|
||||
|
||||
// C API entry point: sets a policy on a GPU group by forwarding to the
// RdcHandler behind the handle. A null handle yields RDC_ST_INVALID_HANDLER.
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                            rdc_policy_t policy) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_set(group_id, policy);
}
|
||||
|
||||
// C API entry point: retrieves the policies of a GPU group by forwarding to the
// RdcHandler behind the handle. A null handle yields RDC_ST_INVALID_HANDLER.
rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t* count,
                            rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_get(group_id, count, policies);
}
|
||||
|
||||
|
||||
// C API entry point: deletes the policy of a GPU group for one condition type
// by forwarding to the RdcHandler behind the handle. A null handle yields
// RDC_ST_INVALID_HANDLER.
rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                               rdc_policy_condition_type_t condition_type) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_delete(group_id, condition_type);
}
|
||||
|
||||
|
||||
|
||||
// C API entry point: registers a policy callback for a GPU group by forwarding
// to the RdcHandler behind the handle. A null handle yields
// RDC_ST_INVALID_HANDLER.
rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                                 rdc_policy_register_callback callback) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_register(group_id, callback);
}
|
||||
// C API entry point: removes the policy callback of a GPU group by forwarding
// to the RdcHandler behind the handle. A null handle yields
// RDC_ST_INVALID_HANDLER.
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_unregister(group_id);
}
|
||||
@@ -18,6 +18,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
|
||||
"${SRC_DIR}/RdcModuleMgrImpl.cc"
|
||||
"${SRC_DIR}/RdcNotificationImpl.cc"
|
||||
"${SRC_DIR}/RdcPerfTimer.cc"
|
||||
"${SRC_DIR}/RdcPolicyImpl.cc"
|
||||
"${SRC_DIR}/RdcRocpLib.cc"
|
||||
"${SRC_DIR}/RdcRocrLib.cc"
|
||||
"${SRC_DIR}/RdcRVSLib.cc"
|
||||
|
||||
@@ -34,6 +34,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h"
|
||||
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
|
||||
#include "rdc_lib/impl/RdcNotificationImpl.h"
|
||||
#include "rdc_lib/impl/RdcPolicyImpl.h"
|
||||
#include "rdc_lib/impl/RdcWatchTableImpl.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
@@ -78,7 +79,8 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode)
|
||||
rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)),
|
||||
rdc_notif_(new RdcNotificationImpl()),
|
||||
watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, rdc_module_mgr_, rdc_notif_)),
|
||||
metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) {
|
||||
metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)),
|
||||
policy_(new RdcPolicyImpl(group_settings_,metric_fetcher_)) {
|
||||
if (mode == RDC_OPERATION_MODE_AUTO) {
|
||||
RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO");
|
||||
metrics_updater_->start();
|
||||
@@ -429,5 +431,33 @@ rdc_status_t RdcEmbeddedHandler::get_mixed_component_version(mixed_component_t c
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
// Policy API
|
||||
// Stores (or overwrites) a policy for a GPU group by delegating to the policy module.
rdc_status_t RdcEmbeddedHandler::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) {
  const rdc_status_t result = policy_->rdc_policy_set(group_id, policy);
  return result;
}
|
||||
|
||||
// Reads the policies configured for a GPU group.
// `count` is a mandatory out-parameter: a null pointer yields RDC_ST_BAD_PARAMETER.
// Otherwise the request is delegated to the policy module and its status returned.
rdc_status_t RdcEmbeddedHandler::rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
                                                rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) {
  return (count != nullptr) ? policy_->rdc_policy_get(group_id, count, policies)
                            : RDC_ST_BAD_PARAMETER;
}
|
||||
|
||||
// Removes the policy of the given condition type from a GPU group by delegating
// to the policy module.
rdc_status_t RdcEmbeddedHandler::rdc_policy_delete(rdc_gpu_group_t group_id,
                                                   rdc_policy_condition_type_t condition_type) {
  const rdc_status_t result = policy_->rdc_policy_delete(group_id, condition_type);
  return result;
}
|
||||
|
||||
// Registers `callback` for policy events of a GPU group by delegating to the
// policy module.
rdc_status_t RdcEmbeddedHandler::rdc_policy_register(rdc_gpu_group_t group_id,
                                                     rdc_policy_register_callback callback) {
  const rdc_status_t result = policy_->rdc_policy_register(group_id, callback);
  return result;
}
|
||||
|
||||
// Unregisters the policy callback of a GPU group by delegating to the policy module.
rdc_status_t RdcEmbeddedHandler::rdc_policy_unregister(rdc_gpu_group_t group_id) {
  const rdc_status_t result = policy_->rdc_policy_unregister(group_id);
  return result;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -664,6 +664,15 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RDC_FI_GPU_PAGE_RETRIED:
|
||||
uint32_t num_pages;
|
||||
amdsmi_retired_page_record_t info;
|
||||
value->status = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, &info);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = num_pages;
|
||||
}
|
||||
break;
|
||||
case RDC_FI_OAM_ID: {
|
||||
amdsmi_asic_info_t asic_info;
|
||||
value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
|
||||
|
||||
@@ -0,0 +1,258 @@
|
||||
/*
|
||||
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "rdc_lib/impl/RdcPolicyImpl.h"
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <ctime>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "common/rdc_utils.h"
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/impl/SmiUtils.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Creates the policy module and starts its background polling thread.
//
// @param group_settings  Source of GPU group membership used when evaluating policies.
// @param metric_fetcher  Used to read the field associated with each policy condition.
//
// The worker re-evaluates all configured policies, then sleeps for 500 ms, until
// `start_` is cleared by the destructor.
// NOTE(review): `start_` is written by the destructor and read by this thread;
// confirm it is declared as std::atomic<bool> in the header.
RdcPolicyImpl::RdcPolicyImpl(const RdcGroupSettingsPtr& group_settings,
                             const RdcMetricFetcherPtr& metric_fetcher)
    : group_settings_(group_settings), metric_fetcher_(metric_fetcher), start_(true) {
  thread_ = std::thread([this]() {
    while (start_) {
      rdc_policy_check_condition();
      // usleep() takes microseconds: 500 * 1000 us == 500 ms between polls.
      usleep(500 * 1000);
    }
  });
}
|
||||
|
||||
// Stops the polling thread: clears the run flag and waits for the worker to exit.
RdcPolicyImpl::~RdcPolicyImpl() {
  start_ = false;
  // join() on a non-joinable thread throws std::system_error; a destructor must
  // never throw, so only join when there is a thread to wait for.
  if (thread_.joinable()) {
    thread_.join();
  }
}
|
||||
|
||||
// Adds a policy to a GPU group, or overwrites the existing policy that has the
// same condition type.
//
// @param group_id  Group the policy applies to.
// @param policy    Condition (type + threshold value) and the action to take.
// @return RDC_ST_BAD_PARAMETER  when the condition type or action is out of range;
//         RDC_ST_NOT_SUPPORTED  when a retried-page policy is requested and reading
//                               RDC_FI_GPU_PAGE_RETRIED fails with RDC_ST_MSI_ERROR
//                               on a GPU of the group;
//         the status of rdc_group_gpu_get_info() when the group cannot be resolved
//                               for that support probe;
//         RDC_ST_OK             once the policy is stored.
rdc_status_t RdcPolicyImpl::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) {
  // Parameter checks do not touch shared state, so they run before locking.
  if (policy.condition.type >= RDC_POLICY_COND_MAX) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (policy.action > RDC_POLICY_ACTION_GPU_RESET) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(policy_mutex_);

  // Probe every GPU of the group to see whether the retried-page field is readable.
  if (RDC_POLICY_COND_MAX_PAGE_RETRIED == policy.condition.type) {
    rdc_group_info_t group_info;
    rdc_status_t status = group_settings_->rdc_group_gpu_get_info(group_id, &group_info);
    if (status != RDC_ST_OK) {
      // group_info is not valid on failure; do not iterate over it.
      return status;
    }

    for (unsigned int i = 0; i < group_info.count; i++) {
      const uint32_t gpu_index = group_info.entity_ids[i];
      rdc_field_value value;
      status = metric_fetcher_->fetch_smi_field(gpu_index, RDC_FI_GPU_PAGE_RETRIED, &value);
      if (status == RDC_ST_MSI_ERROR) {
        return RDC_ST_NOT_SUPPORTED;
      }
    }
  }

  // operator[] creates an empty policy list the first time a group is seen.
  std::vector<rdc_policy_t>& policies = settings_[group_id];
  auto existing = std::find_if(policies.begin(), policies.end(), [&policy](const rdc_policy_t& p) {
    return p.condition.type == policy.condition.type;
  });

  if (existing != policies.end()) {
    // One policy per condition type: overwrite threshold and action in place.
    existing->condition.value = policy.condition.value;
    existing->action = policy.action;
  } else {
    policies.push_back(policy);
  }

  return RDC_ST_OK;
}
|
||||
|
||||
// Copies the policies configured for a GPU group into a caller-provided array.
//
// @param group_id  Group to query.
// @param count     Out: number of entries written to `policies`.
// @param policies  Out: array with room for RDC_MAX_POLICY_SETTINGS entries.
// @return RDC_ST_BAD_PARAMETER when `count` or `policies` is null;
//         RDC_ST_NOT_FOUND     when no policy was ever set for the group;
//         RDC_ST_OK            otherwise.
rdc_status_t RdcPolicyImpl::rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
                                           rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) {
  if (count == nullptr || policies == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(policy_mutex_);

  auto it = settings_.find(group_id);
  if (it == settings_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  const std::vector<rdc_policy_t>& stored = it->second;
  uint32_t written = 0;
  for (const rdc_policy_t& src : stored) {
    // Never write past the capacity the signature promises for the caller's array.
    if (written >= RDC_MAX_POLICY_SETTINGS) {
      break;
    }
    policies[written].condition.type = src.condition.type;
    policies[written].condition.value = src.condition.value;
    policies[written].action = src.action;
    ++written;
  }
  *count = written;

  return RDC_ST_OK;
}
|
||||
|
||||
// Removes every policy of `condition_type` from the given group.
// Returns RDC_ST_OK when at least one policy was removed, RDC_ST_NOT_FOUND when
// the group has no settings or no policy of that type.
rdc_status_t RdcPolicyImpl::rdc_policy_delete(rdc_gpu_group_t group_id,
                                              rdc_policy_condition_type_t condition_type) {
  std::lock_guard<std::mutex> guard(policy_mutex_);

  auto group_entry = settings_.find(group_id);
  if (group_entry == settings_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  std::vector<rdc_policy_t>& stored = group_entry->second;
  // Erase-remove: compact the survivors, then drop the tail of matching entries.
  const auto first_removed =
      std::remove_if(stored.begin(), stored.end(), [condition_type](const rdc_policy_t& p) {
        return p.condition.type == condition_type;
      });
  const bool removed_any = (first_removed != stored.end());
  stored.erase(first_removed, stored.end());

  return removed_any ? RDC_ST_OK : RDC_ST_NOT_FOUND;
}
|
||||
|
||||
// Associates `callback` with a group, replacing any callback registered earlier
// for the same group. Always returns RDC_ST_OK.
rdc_status_t RdcPolicyImpl::rdc_policy_register(rdc_gpu_group_t group_id,
                                                rdc_policy_register_callback callback) {
  std::lock_guard<std::mutex> guard(policy_mutex_);

  // operator[] inserts a new slot when the group is unknown and yields the
  // existing one otherwise, so a single assignment covers both cases.
  callbacks_[group_id] = callback;
  return RDC_ST_OK;
}
|
||||
|
||||
// Drops the callback registered for a group, if any. Always returns RDC_ST_OK,
// including when nothing was registered.
rdc_status_t RdcPolicyImpl::rdc_policy_unregister(rdc_gpu_group_t group_id) {
  {
    std::lock_guard<std::mutex> guard(policy_mutex_);
    callbacks_.erase(group_id);
  }
  return RDC_ST_OK;
}
|
||||
|
||||
void RdcPolicyImpl::rdc_policy_check_condition() {
|
||||
// go through the settings
|
||||
for (auto it : settings_) {
|
||||
rdc_gpu_group_t group_id = it.first;
|
||||
std::vector<rdc_policy_t> policies = it.second;
|
||||
rdc_policy_register_callback callback = rdc_policy_get_callback(group_id);
|
||||
|
||||
for (auto policy : policies) {
|
||||
rdc_status_t status;
|
||||
rdc_field_value value;
|
||||
rdc_group_info_t group_info;
|
||||
uint32_t gpu_index;
|
||||
|
||||
status = group_settings_->rdc_group_gpu_get_info(group_id, &group_info);
|
||||
for (unsigned int i = 0; i < group_info.count; i++) {
|
||||
rdc_field_t map[RDC_POLICY_COND_MAX] = {RDC_FI_GPU_PAGE_RETRIED, RDC_FI_GPU_TEMP,
|
||||
RDC_FI_POWER_USAGE};
|
||||
|
||||
gpu_index = group_info.entity_ids[i];
|
||||
status = metric_fetcher_->fetch_smi_field(gpu_index, map[policy.condition.type], &value);
|
||||
if (status == RDC_ST_OK) {
|
||||
if (value.value.l_int > policy.condition.value) {
|
||||
if (RDC_POLICY_ACTION_GPU_RESET == policy.action) {
|
||||
rdc_policy_gpu_reset(gpu_index);
|
||||
}
|
||||
|
||||
// callback if needed
|
||||
if (callback) {
|
||||
rdc_policy_callback_response_t response = {1, policy.condition, group_id,
|
||||
value.value.l_int};
|
||||
callback(&response);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Looks up the callback registered for a group; returns nullptr when none is
// registered. Does not take `policy_mutex_` itself.
rdc_policy_register_callback RdcPolicyImpl::rdc_policy_get_callback(rdc_gpu_group_t group_id) {
  const auto entry = callbacks_.find(group_id);
  if (entry == callbacks_.end()) {
    return nullptr;
  }
  return entry->second;
}
|
||||
|
||||
void RdcPolicyImpl::rdc_policy_gpu_reset(uint32_t gpu_index) {
|
||||
amdsmi_processor_handle processor_handle = {};
|
||||
|
||||
amdsmi_status_t err = get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
amdsmi_reset_gpu(processor_handle);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -23,6 +23,8 @@ THE SOFTWARE.
|
||||
|
||||
#include <grpcpp/grpcpp.h>
|
||||
|
||||
#include <future>
|
||||
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
|
||||
amd::rdc::RdcHandler* make_handler(const char* ip_and_port, const char* root_ca,
|
||||
@@ -243,8 +245,8 @@ rdc_status_t RdcStandaloneHandler::rdc_device_get_attributes(uint32_t gpu_index,
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
rdc_status_t RdcStandaloneHandler::rdc_device_get_component_version(rdc_component_t component, rdc_component_version_t* p_rdc_compv) {
|
||||
|
||||
rdc_status_t RdcStandaloneHandler::rdc_device_get_component_version(
|
||||
rdc_component_t component, rdc_component_version_t* p_rdc_compv) {
|
||||
if (!p_rdc_compv) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
@@ -663,8 +665,8 @@ rdc_status_t RdcStandaloneHandler::rdc_field_update_all(uint32_t wait_for_update
|
||||
}
|
||||
|
||||
// It is only an interface for the client under the GRPC framework and is not used as an RDC API.
|
||||
rdc_status_t RdcStandaloneHandler::get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) {
|
||||
|
||||
rdc_status_t RdcStandaloneHandler::get_mixed_component_version(
|
||||
mixed_component_t component, mixed_component_version_t* p_mixed_compv) {
|
||||
if (!p_mixed_compv) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
@@ -681,7 +683,153 @@ rdc_status_t RdcStandaloneHandler::get_mixed_component_version(mixed_component_t
|
||||
|
||||
strncpy_with_null(p_mixed_compv->version, reply.version().c_str(), USR_MAX_VERSION_STR_LENGTH);
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
// Policy RdcAPI
|
||||
// Sends a SetPolicy request to the remote service: packs the group id, the
// action and the condition (type + value) into the request, performs the RPC
// and converts the transport status plus the reply status into an rdc_status_t.
rdc_status_t RdcStandaloneHandler::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) {
  ::rdc::SetPolicyRequest request;
  request.set_group_id(group_id);

  auto* wire_policy = request.mutable_policy();
  wire_policy->set_action(static_cast<::rdc::Policy_Action>(policy.action));

  auto* wire_condition = wire_policy->mutable_condition();
  wire_condition->set_type(static_cast<::rdc::PolicyCondition_Type>(policy.condition.type));
  wire_condition->set_value(policy.condition.value);

  // Issue the RPC.
  ::rdc::SetPolicyResponse reply;
  ::grpc::ClientContext context;
  const ::grpc::Status rpc_status = stub_->SetPolicy(&context, request, &reply);

  return error_handle(rpc_status, reply.status());
}
|
||||
|
||||
// Fetches the policies of a GPU group from the remote service.
//
// @param group_id  Group to query.
// @param count     Out: number of entries written to `policies`.
// @param policies  Out: array with room for RDC_MAX_POLICY_SETTINGS entries.
// @return RDC_ST_BAD_PARAMETER when `count` or `policies` is null; the status
//         produced by error_handle() when the RPC or the remote call failed;
//         RDC_ST_OK otherwise.
//
// The entry count announced by the server is not trusted blindly: it is clamped
// to the number of policies actually present in the reply and to the capacity
// of the caller's array.
rdc_status_t RdcStandaloneHandler::rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
                                                  rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) {
  if (count == nullptr || policies == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  ::rdc::GetPolicyRequest request;
  ::rdc::GetPolicyResponse reply;
  ::grpc::ClientContext context;

  request.set_group_id(group_id);

  // call gRPC
  ::grpc::Status status = stub_->GetPolicy(&context, request, &reply);
  rdc_status_t err_status = error_handle(status, reply.status());
  if (err_status != RDC_ST_OK) return err_status;

  const auto& response = reply.response();

  uint32_t policy_count = response.count();
  const uint32_t available = static_cast<uint32_t>(response.policies_size());
  if (policy_count > available) {
    policy_count = available;  // never index past the repeated field
  }
  if (policy_count > RDC_MAX_POLICY_SETTINGS) {
    policy_count = RDC_MAX_POLICY_SETTINGS;  // never write past the caller's array
  }

  for (uint32_t i = 0; i < policy_count; ++i) {
    const ::rdc::Policy& policy = response.policies(i);
    const ::rdc::PolicyCondition& cond = policy.condition();

    policies[i].condition.type = static_cast<rdc_policy_condition_type_t>(cond.type());
    policies[i].condition.value = cond.value();
    policies[i].action = static_cast<rdc_policy_action_t>(policy.action());
  }

  *count = policy_count;

  return RDC_ST_OK;
}
|
||||
|
||||
// Sends a DeletePolicy request for (group, condition type) to the remote
// service and converts the transport status plus the reply status into an
// rdc_status_t.
rdc_status_t RdcStandaloneHandler::rdc_policy_delete(rdc_gpu_group_t group_id,
                                                     rdc_policy_condition_type_t condition_type) {
  const auto wire_type =
      static_cast<::rdc::DeletePolicyRequest_PolicyConditionType>(condition_type);

  ::rdc::DeletePolicyRequest request;
  request.set_group_id(group_id);
  request.set_condition_type(wire_type);

  // Issue the RPC.
  ::rdc::DeletePolicyResponse reply;
  ::grpc::ClientContext context;
  const ::grpc::Status rpc_status = stub_->DeletePolicy(&context, request, &reply);

  return error_handle(rpc_status, reply.status());
}
|
||||
|
||||
// Registers `callback` for policy events of a group reported by the remote
// service. A dedicated thread opens the RegisterPolicy server stream and invokes
// `callback` for every message received.
//
// @return RDC_ST_BAD_PARAMETER when `callback` is null;
//         RDC_ST_CONFLICT      when the group already has an entry in `policy_threads_`;
//         RDC_ST_OK            once the reader thread has been started.
//
// NOTE(review): `policy_threads_` is still accessed from the reader thread and
// from API callers without a lock; confirm whether the class has a mutex that
// should guard it.
rdc_status_t RdcStandaloneHandler::rdc_policy_register(rdc_gpu_group_t group_id,
                                                       rdc_policy_register_callback callback) {
  if (callback == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // check if a thread for a group is already registered
  if (policy_threads_.find(group_id) != policy_threads_.end()) {
    return RDC_ST_CONFLICT;
  }

  // Publish the context BEFORE the thread starts. The thread looks its own entry
  // up to decide whether to keep running; if the entry were inserted afterwards
  // the thread could see "not found" and stop immediately.
  struct policy_thread_context ctx = {true, nullptr};
  auto inserted = policy_threads_.insert(std::make_pair(group_id, ctx));

  inserted.first->second.t = new std::thread([this, group_id, callback]() {
    ::rdc::RegisterPolicyRequest request;
    ::rdc::RegisterPolicyResponse reply;
    ::grpc::ClientContext context;

    request.set_group_id(group_id);

    // Open the server stream.
    std::unique_ptr<grpc::ClientReader<::rdc::RegisterPolicyResponse>> reader(
        stub_->RegisterPolicy(&context, request));

    while (true) {
      // Stop once the registration was removed or flagged as stopped.
      auto it = policy_threads_.find(group_id);
      if (it == policy_threads_.end() || !it->second.start) {
        break;
      }

      // Read() returns false when the stream has ended; there is nothing more to
      // receive, so leave the loop instead of spinning on a finished stream.
      if (!reader->Read(&reply)) {
        break;
      }

      const ::rdc::PolicyCondition& cond = reply.condition();

      rdc_policy_callback_response_t response;
      response.version = reply.version();
      response.condition.type = static_cast<rdc_policy_condition_type_t>(cond.type());
      response.condition.value = cond.value();
      response.group_id = reply.group_id();
      response.value = reply.value();

      callback(&response);
    }

    reader->Finish();
  });

  return RDC_ST_OK;
}
|
||||
|
||||
// Asks the reader thread of a group to stop and sends an UnRegisterPolicy
// request to the remote service. Returns the status derived from the RPC and
// the reply.
//
// NOTE(review): the entry stays in `policy_threads_` and its thread object is
// neither joined nor freed here, so a later register call for the same group
// reports RDC_ST_CONFLICT. Confirm whether cleanup happens elsewhere.
rdc_status_t RdcStandaloneHandler::rdc_policy_unregister(rdc_gpu_group_t group_id) {
  // Signal the associated thread of the group, if there is one.
  const auto entry = policy_threads_.find(group_id);
  if (entry != policy_threads_.end()) {
    entry->second.start = false;
  }

  // Build the request.
  ::rdc::UnRegisterPolicyRequest request;
  request.set_group_id(group_id);

  // Issue the RPC.
  ::rdc::UnRegisterPolicyResponse reply;
  ::grpc::ClientContext context;
  const ::grpc::Status rpc_status = stub_->UnRegisterPolicy(&context, request, &reply);

  return error_handle(rpc_status, reply.status());
}
|
||||
|
||||
} // namespace rdc
|
||||
|
||||
@@ -22,6 +22,8 @@ THE SOFTWARE.
|
||||
#ifndef SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_
|
||||
#define SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_
|
||||
|
||||
#include <thread>
|
||||
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc.h"
|
||||
|
||||
@@ -34,6 +36,7 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
|
||||
~RdcAPIServiceImpl();
|
||||
|
||||
rdc_status_t Initialize(uint64_t rdcd_init_flags = 0);
|
||||
void Shutdown();
|
||||
|
||||
::grpc::Status GetAllDevices(::grpc::ServerContext* context, const ::rdc::Empty* request,
|
||||
::rdc::GetAllDevicesResponse* reply) override;
|
||||
@@ -42,8 +45,9 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
|
||||
const ::rdc::GetDeviceAttributesRequest* request,
|
||||
::rdc::GetDeviceAttributesResponse* reply) override;
|
||||
|
||||
::grpc::Status GetComponentVersion(::grpc::ServerContext* context, const ::rdc::GetComponentVersionRequest* request,
|
||||
::rdc::GetComponentVersionResponse* reply) override;
|
||||
::grpc::Status GetComponentVersion(::grpc::ServerContext* context,
|
||||
const ::rdc::GetComponentVersionRequest* request,
|
||||
::rdc::GetComponentVersionResponse* reply) override;
|
||||
|
||||
::grpc::Status CreateGpuGroup(::grpc::ServerContext* context,
|
||||
const ::rdc::CreateGpuGroupRequest* request,
|
||||
@@ -125,11 +129,41 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
|
||||
const ::rdc::DiagnosticTestCaseRunRequest* request,
|
||||
::rdc::DiagnosticTestCaseRunResponse* reply) override;
|
||||
|
||||
::grpc::Status GetMixedComponentVersion(::grpc::ServerContext* context, const ::rdc::GetMixedComponentVersionRequest* request,
|
||||
::rdc::GetMixedComponentVersionResponse* reply) override;
|
||||
::grpc::Status GetMixedComponentVersion(::grpc::ServerContext* context,
|
||||
const ::rdc::GetMixedComponentVersionRequest* request,
|
||||
::rdc::GetMixedComponentVersionResponse* reply) override;
|
||||
|
||||
::grpc::Status SetPolicy(::grpc::ServerContext* context, const ::rdc::SetPolicyRequest* request,
|
||||
::rdc::SetPolicyResponse* reply) override;
|
||||
|
||||
::grpc::Status GetPolicy(::grpc::ServerContext* context, const ::rdc::GetPolicyRequest* request,
|
||||
::rdc::GetPolicyResponse* reply) override;
|
||||
|
||||
::grpc::Status DeletePolicy(::grpc::ServerContext* context,
|
||||
const ::rdc::DeletePolicyRequest* request,
|
||||
::rdc::DeletePolicyResponse* reply) override;
|
||||
|
||||
::grpc::Status RegisterPolicy(
|
||||
::grpc::ServerContext* context, const ::rdc::RegisterPolicyRequest* request,
|
||||
::grpc::ServerWriter< ::rdc::RegisterPolicyResponse>* stream) override;
|
||||
|
||||
::grpc::Status UnRegisterPolicy(::grpc::ServerContext* context,
|
||||
const ::rdc::UnRegisterPolicyRequest* request,
|
||||
::rdc::UnRegisterPolicyResponse* reply) override;
|
||||
|
||||
private:
|
||||
bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target);
|
||||
rdc_handle_t rdc_handle_;
|
||||
|
||||
struct policy_thread_context {
|
||||
pthread_mutex_t mutex;
|
||||
pthread_cond_t cond;
|
||||
rdc_policy_callback_response_t response;
|
||||
bool start;
|
||||
};
|
||||
// map for group_id and thread context
|
||||
static std::map<uint32_t, struct policy_thread_context*> policy_threads_;
|
||||
static int PolicyCallback(rdc_policy_callback_response_t* userData);
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
|
||||
@@ -20,11 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc/rdc_api_service.h"
|
||||
#include "rdc/rdc_server_main.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <grpcpp/grpcpp.h>
|
||||
|
||||
#include <csignal>
|
||||
#include <future>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
@@ -32,12 +33,15 @@ THE SOFTWARE.
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc/rdc_private.h"
|
||||
#include "rdc/rdc_server_main.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
std::map<uint32_t, RdcAPIServiceImpl::policy_thread_context*> RdcAPIServiceImpl::policy_threads_;
|
||||
|
||||
RdcAPIServiceImpl::RdcAPIServiceImpl() : rdc_handle_(nullptr) {}
|
||||
|
||||
rdc_status_t RdcAPIServiceImpl::Initialize(uint64_t rdcd_init_flags) {
|
||||
@@ -55,6 +59,14 @@ rdc_status_t RdcAPIServiceImpl::Initialize(uint64_t rdcd_init_flags) {
|
||||
return result;
|
||||
}
|
||||
|
||||
void RdcAPIServiceImpl::Shutdown() {
|
||||
// exit policy threads
|
||||
for (auto it : policy_threads_) {
|
||||
policy_thread_context* ctx = it.second;
|
||||
ctx->start = false;
|
||||
}
|
||||
}
|
||||
|
||||
RdcAPIServiceImpl::~RdcAPIServiceImpl() {
|
||||
if (rdc_handle_) {
|
||||
rdc_stop_embedded(rdc_handle_);
|
||||
@@ -104,9 +116,9 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() {
|
||||
return ::grpc::Status::OK;
|
||||
}
|
||||
|
||||
::grpc::Status RdcAPIServiceImpl::GetComponentVersion(::grpc::ServerContext* context,
|
||||
const ::rdc::GetComponentVersionRequest* request,
|
||||
::rdc::GetComponentVersionResponse* reply) {
|
||||
::grpc::Status RdcAPIServiceImpl::GetComponentVersion(
|
||||
::grpc::ServerContext* context, const ::rdc::GetComponentVersionRequest* request,
|
||||
::rdc::GetComponentVersionResponse* reply) {
|
||||
(void)(context);
|
||||
if (!reply) {
|
||||
return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty reply");
|
||||
@@ -661,9 +673,9 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
|
||||
return ::grpc::Status::OK;
|
||||
}
|
||||
|
||||
::grpc::Status RdcAPIServiceImpl::GetMixedComponentVersion(::grpc::ServerContext* context,
|
||||
const ::rdc::GetMixedComponentVersionRequest* request,
|
||||
::rdc::GetMixedComponentVersionResponse* reply) {
|
||||
::grpc::Status RdcAPIServiceImpl::GetMixedComponentVersion(
|
||||
::grpc::ServerContext* context, const ::rdc::GetMixedComponentVersionRequest* request,
|
||||
::rdc::GetMixedComponentVersionResponse* reply) {
|
||||
(void)(context);
|
||||
if (!reply) {
|
||||
return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty reply");
|
||||
@@ -687,9 +699,185 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
|
||||
reply->set_status(RDC_ST_OK);
|
||||
return ::grpc::Status::OK;
|
||||
} else {
|
||||
return ::grpc::Status(::grpc::StatusCode::INVALID_ARGUMENT, "The provided request parameters are invalid");
|
||||
return ::grpc::Status(::grpc::StatusCode::INVALID_ARGUMENT,
|
||||
"The provided request parameters are invalid");
|
||||
}
|
||||
}
|
||||
|
||||
// gRPC handler for SetPolicy: converts the wire policy into an rdc_policy_t,
// calls rdc_policy_set() and reports its result in the reply status field.
// The RPC itself fails only when request or reply is missing.
::grpc::Status RdcAPIServiceImpl::SetPolicy(::grpc::ServerContext* context,
                                            const ::rdc::SetPolicyRequest* request,
                                            ::rdc::SetPolicyResponse* reply) {
  (void)(context);
  if (!request || !reply) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  // Translate the wire message into the C API structure.
  const ::rdc::Policy wire_policy = request->policy();
  const ::rdc::PolicyCondition wire_condition = wire_policy.condition();

  rdc_policy_t policy;
  policy.action = static_cast<rdc_policy_action_t>(wire_policy.action());
  policy.condition.type = static_cast<rdc_policy_condition_type_t>(wire_condition.type());
  policy.condition.value = wire_condition.value();

  // Apply it and hand the outcome back to the client.
  reply->set_status(rdc_policy_set(rdc_handle_, request->group_id(), policy));

  return ::grpc::Status::OK;
}
|
||||
|
||||
// gRPC handler for GetPolicy: calls rdc_policy_get() and, on success, copies the
// returned policies into the reply. The RDC status is always reported in the
// reply status field; the RPC itself fails only when request or reply is missing.
::grpc::Status RdcAPIServiceImpl::GetPolicy(::grpc::ServerContext* context,
                                            const ::rdc::GetPolicyRequest* request,
                                            ::rdc::GetPolicyResponse* reply) {
  (void)(context);
  if (!request || !reply) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS];
  uint32_t count = 0;

  const rdc_status_t result = rdc_policy_get(rdc_handle_, request->group_id(), &count, policies);
  reply->set_status(result);

  if (result == RDC_ST_OK) {
    ::rdc::PolicyResponse* out = reply->mutable_response();
    out->set_count(count);

    for (uint32_t idx = 0; idx != count; ++idx) {
      const rdc_policy_t& src = policies[idx];

      ::rdc::Policy* dst = out->add_policies();
      dst->set_action(static_cast<::rdc::Policy_Action>(src.action));

      auto* dst_condition = dst->mutable_condition();
      dst_condition->set_type(static_cast<::rdc::PolicyCondition_Type>(src.condition.type));
      dst_condition->set_value(src.condition.value);
    }
  }

  return ::grpc::Status::OK;
}
|
||||
|
||||
// gRPC handler for DeletePolicy: calls rdc_policy_delete() for the requested
// group and condition type and reports its result in the reply status field.
// The RPC itself fails only when request or reply is missing.
::grpc::Status RdcAPIServiceImpl::DeletePolicy(::grpc::ServerContext* context,
                                               const ::rdc::DeletePolicyRequest* request,
                                               ::rdc::DeletePolicyResponse* reply) {
  (void)(context);
  if (!request || !reply) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const auto condition_type =
      static_cast<rdc_policy_condition_type_t>(request->condition_type());
  const rdc_status_t result = rdc_policy_delete(rdc_handle_, request->group_id(), condition_type);

  reply->set_status(result);

  return ::grpc::Status::OK;
}
|
||||
|
||||
// Callback handed to rdc_policy_register(). Stores the event in the context of
// the event's group and signals that context's condition variable.
// Returns 1 when called with null data, 0 otherwise (including when the group
// has no registered context).
int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData) {
  if (userData == nullptr) {
    std::cerr << "The rdc_policy_callback returns null data\n";
    return 1;
  }

  const auto entry = policy_threads_.find(userData->group_id);
  if (entry == policy_threads_.end()) {
    return 0;
  }

  policy_thread_context* target = entry->second;

  // Publish the event and signal under the context mutex.
  pthread_mutex_lock(&target->mutex);
  target->response = *userData;
  pthread_cond_signal(&target->cond);
  pthread_mutex_unlock(&target->mutex);

  return 0;
}
|
||||
|
||||
// gRPC handler for the RegisterPolicy server stream.
//
// Registers PolicyCallback for the requested group, then forwards every event
// delivered through the group's context to the client. The handler returns
// (ending the stream) when the context's `start` flag is cleared by
// UnRegisterPolicy()/Shutdown(), when the client cancels the call, or when a
// write to the stream fails.
//
// The streaming loop runs directly in the handler. The previous version wrapped
// it in std::async, but the returned future blocked in its destructor until the
// loop finished, so the handler already ran for the lifetime of the stream.
//
// NOTE(review): `policy_threads_` is shared between handlers and PolicyCallback
// without a lock; confirm whether it should be guarded.
::grpc::Status RdcAPIServiceImpl::RegisterPolicy(
    ::grpc::ServerContext* context, const ::rdc::RegisterPolicyRequest* request,
    ::grpc::ServerWriter<::rdc::RegisterPolicyResponse>* writer) {
  if (!context || !writer || !request) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const uint32_t group_id = request->group_id();

  // Reuse the context of an earlier registration of this group. A context left
  // behind by UnRegisterPolicy has start == false; map::insert would not replace
  // it, so a freshly allocated context would leak and the loop below would exit
  // immediately on the stale flag.
  policy_thread_context* ctx = nullptr;
  auto existing = policy_threads_.find(group_id);
  if (existing != policy_threads_.end()) {
    ctx = existing->second;
  } else {
    ctx = new policy_thread_context;
    ctx->mutex = PTHREAD_MUTEX_INITIALIZER;
    ctx->cond = PTHREAD_COND_INITIALIZER;
    policy_threads_.insert(std::make_pair(group_id, ctx));
  }
  ctx->start = true;

  const rdc_status_t result = rdc_policy_register(rdc_handle_, group_id, PolicyCallback);
  if (result != RDC_ST_OK) {
    ctx->start = false;
    return ::grpc::Status::OK;
  }

  while (ctx->start && !context->IsCancelled()) {
    // Wake up at least once per second to re-check the exit conditions.
    struct timespec deadline;
    clock_gettime(CLOCK_REALTIME, &deadline);
    deadline.tv_sec += 1;

    bool stream_alive = true;

    pthread_mutex_lock(&ctx->mutex);
    const int rc = pthread_cond_timedwait(&ctx->cond, &ctx->mutex, &deadline);
    if (rc == 0) {
      // Signalled by PolicyCallback: forward the stored event.
      ::rdc::RegisterPolicyResponse reply;
      reply.set_status(RDC_ST_OK);
      reply.set_version(ctx->response.version);
      reply.set_group_id(ctx->response.group_id);
      reply.set_value(ctx->response.value);

      ::rdc::PolicyCondition* cond = reply.mutable_condition();
      cond->set_type(static_cast<::rdc::PolicyCondition_Type>(ctx->response.condition.type));
      cond->set_value(ctx->response.condition.value);

      // Write() returns false once the stream is closed.
      stream_alive = writer->Write(reply);
    }
    // rc == ETIMEDOUT (or any other error): nothing to send this round.
    pthread_mutex_unlock(&ctx->mutex);

    if (!stream_alive) {
      break;
    }
  }

  return ::grpc::Status::OK;
}
|
||||
|
||||
// gRPC handler for UnRegisterPolicy: calls rdc_policy_unregister() for the
// requested group and, on success, clears the `start` flag of the group's
// context so the matching RegisterPolicy stream loop exits.
// The RDC status is reported in the reply status field, as in the other policy
// handlers; the RPC itself fails only when request or reply is missing.
::grpc::Status RdcAPIServiceImpl::UnRegisterPolicy(::grpc::ServerContext* context,
                                                   const ::rdc::UnRegisterPolicyRequest* request,
                                                   ::rdc::UnRegisterPolicyResponse* reply) {
  (void)(context);
  if (!reply || !request) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const rdc_status_t result = rdc_policy_unregister(rdc_handle_, request->group_id());

  // Without this the client always sees the default status value, whatever the
  // outcome of the call above.
  reply->set_status(result);

  if (result == RDC_ST_OK) {
    auto it = policy_threads_.find(request->group_id());
    if (it != policy_threads_.end()) {
      policy_thread_context* ctx = it->second;
      ctx->start = false;
    }
  }

  return ::grpc::Status::OK;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -271,6 +271,8 @@ static int FileOwner(const char* fn, std::string* owner) {
|
||||
}
|
||||
|
||||
void RDCServer::ShutDown(void) {
|
||||
api_service_->Shutdown();
|
||||
|
||||
server_->Shutdown();
|
||||
|
||||
if (rdc_admin_service_) {
|
||||
|
||||
Ссылка в новой задаче
Block a user