Enable RDC policy feature

1. Add policy APIs
2. Add policy example for policy API usage

Change-Id: I14deb7c809d0b865b7bb083842092fc37868025e
Signed-off-by: Chao Fei <Chao.Fei@amd.com>
This commit is contained in:
Chao Fei
2024-08-27 11:46:40 +08:00
parent 4bd31b605a
commit 345ac64a43
19 changed files with 1382 additions and 18 deletions
+2
View File
@@ -53,6 +53,8 @@ FLD_DESC_ENT(RDC_FI_GPU_MM_ENC_UTIL, "Mutilmedia encoder busy percentage",
FLD_DESC_ENT(RDC_FI_GPU_MM_DEC_UTIL, "Mutilmedia decoder busy percentage", "GPU_MM_DEC_UTIL", true)
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_ACTIVITY, "Memory busy percentage", "GPU_MEM_UTIL", true)
FLD_DESC_ENT(RDC_FI_GPU_PAGE_RETRIED, "Retried page of the GPU instance", "GPU_PAGE_RETRIED", true)
// ECC totals
FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated Single Error Correction", "ECC_CORRECT", true)
FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated Double Error Detection", "ECC_UNCORRECT", true)
+7
View File
@@ -113,6 +113,13 @@ set(ROCPROFILER_EXAMPLE_EXE "rocprofiler")
add_executable(${ROCPROFILER_EXAMPLE_EXE} "${ROCPROFILER_EXAMPLE_SRC_LIST}")
target_link_libraries(${ROCPROFILER_EXAMPLE_EXE} pthread dl rdc_bootstrap)
# Policy example: builds the "policy" executable from policy_example.cc.
set(POLICY_EXAMPLE_SRC_LIST "policy_example.cc")
# Echo the source list at configure time.
cmake_print_variables(POLICY_EXAMPLE_SRC_LIST)
set(POLICY_EXAMPLE_EXE "policy")
add_executable(${POLICY_EXAMPLE_EXE} "${POLICY_EXAMPLE_SRC_LIST}")
# Same link line as the rocprofiler example target above.
target_link_libraries(${POLICY_EXAMPLE_EXE} pthread dl rdc_bootstrap)
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Finished Cmake Example ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
+195
View File
@@ -0,0 +1,195 @@
#include <unistd.h>
#include <iostream>
#include "rdc/rdc.h"
// Map a policy condition type to the human readable label printed by the
// callback. Any value without a dedicated label yields "Unknown_Type".
static const char* condition_type_to_str(rdc_policy_condition_type_t type) {
  switch (type) {
    case RDC_POLICY_COND_MAX_PAGE_RETRIED:
      return "Retried Page Limit";
    case RDC_POLICY_COND_THERMAL:
      return "Temperature Limit";
    case RDC_POLICY_COND_POWER:
      return "Power Limit";
    default:
      return "Unknown_Type";
  }
}
// Timestamp of the most recently printed notification (0 = nothing printed yet).
static time_t last_time = 0;

// Policy callback registered with rdc_policy_register().
// Prints which condition exceeded its threshold together with the offending
// value. Output is throttled to one message per 5 seconds.
// Returns 1 when no data is supplied, 0 otherwise.
int rdc_policy_callback(rdc_policy_callback_response_t* userData) {
  if (!userData) {
    std::cerr << "The rdc_policy_callback returns null data\n";
    return 1;
  }

  const time_t current = time(NULL);
  const bool throttled = difftime(current, last_time) < 5;
  if (!throttled) {
    std::cout << "The " << condition_type_to_str(userData->condition.type)
              << " exceeds the threshold " << userData->condition.value << " with the value "
              << userData->value << std::endl;
    // Remember when we last printed so the next 5 seconds stay quiet.
    last_time = current;
  }
  return 0;
}
// Example entry point for the RDC policy API.
//
// Flow: choose embedded/standalone mode, initialize RDC, create a group that
// holds every GPU, set thermal / power / retired-page policies on it, register
// a callback, wait 30 seconds for notifications, then unregister, delete one
// policy and tear everything down.
//
// Returns the last rdc_status_t produced (RDC_ST_OK on success), or 1 when the
// mode cannot be read from standard input.
//
// NOTE: this function uses `goto cleanup`. Declarations located between the
// first `goto` and the `cleanup` label must stay free of initializers,
// otherwise the jumps become ill-formed in C++.
int main() {
  rdc_gpu_group_t group_id;
  rdc_status_t result;
  bool standalone = false;
  rdc_handle_t rdc_handle = nullptr;
  // Set once rdc_connect()/rdc_start_embedded() succeeded; only then does
  // rdc_handle refer to something that has to be released in cleanup.
  bool engine_started = false;
  uint32_t count = 0;
  char hostIpAddress[] = {"localhost:50051"};
  char group_name[] = {"group1"};

  // Select the embedded mode and standalone mode dynamically.
  std::cout << "Start rdci in: \n";
  std::cout << "0 - Embedded mode \n";
  std::cout << "1 - Standalone mode \n";
  while (!(std::cin >> standalone)) {
    // End of input or an unrecoverable stream error can never be fixed by
    // retrying, so bail out instead of looping forever.
    if (std::cin.eof() || std::cin.bad()) {
      std::cerr << "Unable to read the mode from standard input.\n";
      return 1;
    }
    std::cout << "Invalid input.\n";
    std::cin.clear();
    std::cin.ignore();
  }
  std::cout << std::endl;
  std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n");

  // Init the rdc
  result = rdc_init(0);
  if (result != RDC_ST_OK) {
    std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
    goto cleanup;
  } else {
    std::cout << "RDC Initialized.\n";
  }

  if (standalone) {  // standalone
    result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
    if (result != RDC_ST_OK) {
      std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
                << std::endl;
      goto cleanup;
    }
  } else {  // embedded
    result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
    if (result != RDC_ST_OK) {
      std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result)
                << std::endl;
      goto cleanup;
    }
  }
  engine_started = true;

  // Now we can use the same API for both standalone and embedded
  // Get the list of devices in the system
  uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
  result = rdc_device_get_all(rdc_handle, gpu_index_list, &count);
  if (result != RDC_ST_OK) {
    std::cout << "Error to find devices on the system. Return: " << rdc_status_string(result);
    goto cleanup;
  }
  if (count == 0) {
    std::cout << "No GPUs find on the sytem ";
    goto cleanup;
  } else {
    std::cout << count << " GPUs found in the system.\n";
  }

  // Create the group
  result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
  if (result != RDC_ST_OK) {
    std::cout << "Error creating group. Return: " << rdc_status_string(result);
    goto cleanup;
  }
  std::cout << "Created the GPU group " << group_id << std::endl;

  // Add all GPUs to the group
  for (uint32_t i = 0; i < count; i++) {
    result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]);
    if (result != RDC_ST_OK) {
      std::cout << "Error adding group. Return: " << rdc_status_string(result);
      goto cleanup;
    }
    rdc_device_attributes_t attribute;
    result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute);
    if (result != RDC_ST_OK) {
      std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result);
      goto cleanup;
    }
    std::cout << "Add GPU " << gpu_index_list[i] << ":" << attribute.device_name << " to group "
              << group_id << std::endl;
  }

  // Define policies that notify when the temperature is above 30 degrees,
  // the power usage is above 150 W, or more than 100 pages were retired.
  rdc_policy_t policy;
  policy.condition = {RDC_POLICY_COND_THERMAL, 30 * 1000};  // 30 degrees in millidegrees Celsius
  policy.action = RDC_POLICY_ACTION_NONE;                   // Notify only
  result = rdc_policy_set(rdc_handle, group_id, policy);
  if (result != RDC_ST_OK) {
    std::cout << "Error set policy RDC_POLICY_COND_THERMAL, Return: " << rdc_status_string(result)
              << std::endl;
    goto cleanup;
  }

  policy.condition = {RDC_POLICY_COND_POWER, 150000};  // 150 W in milliwatts
  policy.action = RDC_POLICY_ACTION_NONE;              // Notify only
  result = rdc_policy_set(rdc_handle, group_id, policy);
  if (result != RDC_ST_OK) {
    std::cout << "Error set policy RDC_POLICY_COND_POWER, Return: " << rdc_status_string(result)
              << std::endl;
    goto cleanup;
  }

  policy.condition = {RDC_POLICY_COND_MAX_PAGE_RETRIED, 100};  // number of pages, no unit conversion
  policy.action = RDC_POLICY_ACTION_NONE;                      // Notify only
  result = rdc_policy_set(rdc_handle, group_id, policy);
  if (result != RDC_ST_OK) {
    std::cout << "Error set policy RDC_POLICY_COND_MAX_PAGE_RETRIED, Return: " << rdc_status_string(result)
              << std::endl;
    goto cleanup;
  }

  // Read the policies back; `count` is reused and now holds the number of policies.
  rdc_policy_t policy_get[RDC_MAX_POLICY_SETTINGS];
  result = rdc_policy_get(rdc_handle, group_id, &count, policy_get);
  if (result != RDC_ST_OK) {
    std::cout << "Error get policy, Return: " << rdc_status_string(result) << std::endl;
    goto cleanup;
  }

  // Register a function to listen to the events
  result = rdc_policy_register(rdc_handle, group_id, rdc_policy_callback);
  if (result != RDC_ST_OK) {
    std::cout << "Error register policy, Return: " << rdc_status_string(result) << std::endl;
    goto cleanup;
  }

  std::cout << "Wait 30 seconds for the events happening ...\n" << std::endl;
  // If the events happen, rdc_policy_callback will be called.
  // sleep() is used because POSIX only guarantees usleep() for values below
  // one second (1,000,000 microseconds).
  sleep(30);

  // Un-register the events
  result = rdc_policy_unregister(rdc_handle, group_id);
  if (result != RDC_ST_OK) {
    std::cout << "Error unregister policy, Return: " << rdc_status_string(result) << std::endl;
    goto cleanup;
  }

  // clear the events
  rdc_policy_condition_type_t condition_type;  // no initializer: earlier gotos jump past this
  condition_type = RDC_POLICY_COND_THERMAL;
  result = rdc_policy_delete(rdc_handle, group_id, condition_type);
  if (result != RDC_ST_OK) {
    std::cout << "Error clear policy, Return: " << rdc_status_string(result) << std::endl;
    goto cleanup;
  }

//... clean up
cleanup:
  std::cout << "Cleaning up.\n";
  // Only release the handle when it was actually obtained; on early failures
  // rdc_handle was never filled in.
  if (engine_started) {
    if (standalone)
      rdc_disconnect(rdc_handle);
    else
      rdc_stop_embedded(rdc_handle);
  }
  rdc_shutdown();
  return result;
}
+130
View File
@@ -189,6 +189,10 @@ typedef enum {
RDC_FI_GPU_MM_DEC_UTIL, //!< Multimedia decoder busy percentage
RDC_FI_GPU_MEMORY_ACTIVITY, //!< Memory busy percentage
/**
* @brief GPU page related fields
*/
RDC_FI_GPU_PAGE_RETRIED = 550, //!< Retried page of the GPU instance
/**
* @brief ECC related fields
*/
@@ -552,6 +556,31 @@ typedef struct {
rdc_diag_test_result_t diag_info[MAX_TEST_CASES];
} rdc_diag_response_t;
/**
* @brief The condition types a policy can monitor.
*
* The enumerators are used as array indexes by the policy implementation
* (condition type -> RDC field to fetch), so new values must be added
* immediately before RDC_POLICY_COND_MAX.
*/
typedef enum {
RDC_POLICY_COND_MAX_PAGE_RETRIED, //!< Max number of retired pages
RDC_POLICY_COND_THERMAL, //!< Temperature threshold, millidegree Celsius
RDC_POLICY_COND_POWER, //!< Power threshold, unit milliwatt
RDC_POLICY_COND_MAX //!< Number of condition types, not a valid condition itself
} rdc_policy_condition_type_t;
/**
* @brief A policy condition: what to monitor and the threshold to compare with.
*/
typedef struct {
rdc_policy_condition_type_t type; //!< What is monitored
int64_t value; //!< Threshold; the condition is met when the measured value is greater than this
} rdc_policy_condition_t;
/**
* @brief The action taken when a policy condition is met.
* RDC_POLICY_ACTION_NONE takes no action besides notifying a registered callback;
* RDC_POLICY_ACTION_GPU_RESET additionally requests a reset of the offending GPU.
*/
typedef enum { RDC_POLICY_ACTION_NONE, RDC_POLICY_ACTION_GPU_RESET } rdc_policy_action_t;
/**
* @brief The structure to define a policy to enforce on a GPU group:
* a condition together with the action to take once the condition is met.
*/
typedef struct {
rdc_policy_condition_t condition; //!< Condition to be met
rdc_policy_action_t action; //!< Action to take
} rdc_policy_t;
/**
* @brief Initialize ROCm RDC.
*
@@ -1131,6 +1160,107 @@ rdc_field_t get_field_id_from_name(const char* name);
*/
const char* rdc_diagnostic_result_string(rdc_diag_result_t result);
/**
* @brief Set one RDC policy on a GPU group.
*
* @details A group can hold several policies, one per condition type, and they
* are set one at a time with this API. Setting a policy whose condition type
* already exists for the group overwrites the threshold value and action of
* the existing policy.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[in] policy The policy to set
*
*
* @retval ::RDC_ST_OK is returned upon successful call.
* @retval ::RDC_ST_INVALID_HANDLER is returned when p_rdc_handle is null.
*/
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_t policy);
/** Capacity of the policies array passed to rdc_policy_get(). */
#define RDC_MAX_POLICY_SETTINGS 32
/**
* @brief Get the RDC policies of a GPU group.
*
* @details Copies the policies currently set on the group into the caller's
* array and reports how many entries were written.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[out] count The number of policies written to the policies array
*
* @param[out] policies Caller provided array of RDC_MAX_POLICY_SETTINGS
* entries that receives the policies
*
*
* @retval ::RDC_ST_OK is returned upon successful call.
* @retval ::RDC_ST_INVALID_HANDLER is returned when p_rdc_handle is null.
*/
rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t* count,
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]);
/**
* @brief Delete the RDC policy of this group that has the given condition type.
*
* @details In a GPU group, only one policy can be set for a specific
* rdc_policy_condition_type_t, so the condition type identifies the policy
* to delete.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id
*
* @param[in] condition_type The condition type to delete
*
* @retval ::RDC_ST_OK is returned upon successful call.
* @retval ::RDC_ST_INVALID_HANDLER is returned when p_rdc_handle is null.
*/
rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_condition_type_t condition_type);
/**
* @brief The data passed to an RDC policy callback when a condition is met.
*/
typedef struct {
unsigned int version; //!< Version of this structure
rdc_policy_condition_t condition; //!< The condition that was met
rdc_gpu_group_t group_id; //!< The group id that triggered this callback
int64_t value; //!< The current value that met the condition
} rdc_policy_callback_response_t;
/**
* @brief Signature of a policy callback.
*
* The user data is a pointer to the rdc_policy_callback_response_t that
* describes the condition that was met.
* NOTE(review): how the int return value is interpreted is not visible here;
* the bundled example returns 0 on success - confirm before relying on it.
*/
typedef int (*rdc_policy_register_callback)(rdc_policy_callback_response_t* userData);
/**
* @brief Register a function to be called when a policy condition of the group is met.
*
* @details Register the RDC policy callback for a GPU group.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[in] callback The callback function to be called when a condition is met.
*
* @retval ::RDC_ST_OK is returned upon successful call.
* @retval ::RDC_ST_INVALID_HANDLER is returned when p_rdc_handle is null.
*/
rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_register_callback callback);
/**
* @brief Un-register the policy callback function of a GPU group.
*
* @details After this call the callback registered with rdc_policy_register()
* for the group is no longer used.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @retval ::RDC_ST_OK is returned upon successful call.
* @retval ::RDC_ST_INVALID_HANDLER is returned when p_rdc_handle is null.
*/
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
#ifdef __cplusplus
}
#endif // __cplusplus
+13
View File
@@ -92,6 +92,19 @@ class RdcHandler {
// It is just a client interface under the GRPC framework and is not used as an RDC API.
// The reason is that RdcEmbeddedHandler::get_mixed_component_version does not need to be called.
virtual rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) = 0;
// Policy API
virtual rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) = 0;
virtual rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) = 0;
virtual rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
rdc_policy_condition_type_t condition_type) = 0;
virtual rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,
rdc_policy_register_callback callback) = 0;
virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0;
virtual ~RdcHandler() {}
};
+57
View File
@@ -0,0 +1,57 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_RDCPOLICY_H_
#define INCLUDE_RDC_LIB_RDCPOLICY_H_
#include <memory>
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Abstract interface of the policy engine. Each method mirrors the public
// rdc_policy_* C API of the same name, without the handle argument.
class RdcPolicy {
 public:
  // Add a policy to the group or update the group's policy of the same condition type.
  virtual rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) = 0;

  // Copy the group's policies into `policies` and report their number in `count`.
  virtual rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
                                      rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) = 0;

  // Remove the group's policy that has the given condition type.
  virtual rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
                                         rdc_policy_condition_type_t condition_type) = 0;

  // Install the callback that is notified when a condition of the group is met.
  virtual rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,
                                           rdc_policy_register_callback callback) = 0;

  // Remove the callback of the group.
  virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0;

  virtual ~RdcPolicy() = default;
};

using RdcPolicyPtr = std::shared_ptr<RdcPolicy>;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCPOLICY_H_
+14
View File
@@ -31,6 +31,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcMetricsUpdater.h"
#include "rdc_lib/RdcModuleMgr.h"
#include "rdc_lib/RdcNotification.h"
#include "rdc_lib/RdcPolicy.h"
#include "rdc_lib/RdcWatchTable.h"
namespace amd {
@@ -94,6 +95,18 @@ class RdcEmbeddedHandler final : public RdcHandler {
// It is just a client interface under the GRPC framework and is not used as an RDC API.
// Pure virtual functions need to be overridden.
rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override;
// Policy API
rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override;
rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override;
rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
rdc_policy_condition_type_t condition_type) override;
rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id, rdc_policy_register_callback callback) override;
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
~RdcEmbeddedHandler() final;
@@ -107,6 +120,7 @@ class RdcEmbeddedHandler final : public RdcHandler {
RdcNotificationPtr rdc_notif_;
RdcWatchTablePtr watch_table_;
RdcMetricsUpdaterPtr metrics_updater_;
RdcPolicyPtr policy_;
std::future<void> updater_;
};
+77
View File
@@ -0,0 +1,77 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_IMPL_RDCPOLICYIMPL_H_
#define INCLUDE_RDC_LIB_IMPL_RDCPOLICYIMPL_H_
#include <atomic>
#include <future>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "amd_smi/amdsmi.h"
#include "rdc_lib/RdcPolicy.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcGroupSettings.h"
namespace amd {
namespace rdc {
// Policy engine implementation.
//
// Keeps the policies and the callback of every GPU group in memory and runs a
// background thread that periodically evaluates the policies through
// rdc_policy_check_condition(). The thread is started by the constructor and
// stopped and joined by the destructor.
class RdcPolicyImpl : public RdcPolicy {
 public:
  RdcPolicyImpl(const RdcGroupSettingsPtr& group_settings,
                const RdcMetricFetcherPtr& metric_fetcher);
  ~RdcPolicyImpl();

  rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override;

  rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
                              rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override;

  rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
                                 rdc_policy_condition_type_t condition_type) override;

  rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,
                                   rdc_policy_register_callback callback) override;

  rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;

 private:
  RdcGroupSettingsPtr group_settings_;  // resolves a group id to its GPUs
  RdcMetricFetcherPtr metric_fetcher_;  // reads the current field values of a GPU
  std::mutex policy_mutex_;             // taken by the public policy methods
  std::thread thread_;                  // background polling thread
  // Run flag of the polling thread. It is written by the destructor while the
  // polling thread reads it, so it has to be atomic to avoid a data race.
  std::atomic<bool> start_;

  std::map<rdc_gpu_group_t, std::vector<rdc_policy_t> > settings_;     // policies per group
  std::map<rdc_gpu_group_t, rdc_policy_register_callback> callbacks_;  // callback per group

  // One evaluation pass over all policies; called repeatedly by thread_.
  void rdc_policy_check_condition();
  void rdc_policy_gpu_reset(uint32_t gpu_index);
  rdc_policy_register_callback rdc_policy_get_callback(rdc_gpu_group_t group_id);
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCPOLICYIMPL_H_
@@ -24,6 +24,8 @@ THE SOFTWARE.
#include <grpcpp/grpcpp.h>
#include <memory>
#include <future>
#include <thread>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc_lib/RdcHandler.h"
@@ -89,6 +91,18 @@ class RdcStandaloneHandler : public RdcHandler {
// It is just a client interface under the GRPC framework and is not used as an RDC API.
// Pure virtual functions need to be overridden
rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override;
// Policy API
rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override;
rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override;
rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
rdc_policy_condition_type_t condition_type) override;
rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,
rdc_policy_register_callback callback) override;
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca,
const char* client_cert, const char* client_key);
@@ -100,6 +114,15 @@ class RdcStandaloneHandler : public RdcHandler {
bool copy_gpu_usage_info(const ::rdc::GpuUsageInfo& src, rdc_gpu_usage_info_t* target);
std::unique_ptr<::rdc::RdcAPI::Stub> stub_;
// thread for policy callback
// Book-keeping for one callback thread: a run flag plus the thread object.
// NOTE(review): `t` is a raw pointer; who allocates and deletes it is not
// visible in this header - confirm in the implementation.
struct policy_thread_context {
bool start;
std::thread *t;
};
// Callback thread contexts, keyed by a uint32_t id
// (presumably the GPU group id - TODO confirm).
std::map<uint32_t, struct policy_thread_context> policy_threads_;
};
} // namespace rdc
+131 -1
View File
@@ -154,6 +154,40 @@ service RdcAPI {
//Just an RPC method not used as an API
rpc GetMixedComponentVersion(GetMixedComponentVersionRequest) returns (GetMixedComponentVersionResponse) {}
// rdc_status_t rdc_policy_set(
// rdc_handle_t p_rdc_handle,
// rdc_gpu_group_t group_id,
// rdc_policy_t policy);
rpc SetPolicy(SetPolicyRequest) returns (SetPolicyResponse) {}
// rdc_status_t rdc_policy_get(
// rdc_handle_t p_rdc_handle,
// rdc_gpu_group_t group_id,
// uint32_t* count,
// rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]);
rpc GetPolicy(GetPolicyRequest) returns (GetPolicyResponse) {}
// rdc_status_t rdc_policy_delete(
// rdc_handle_t p_rdc_handle,
// rdc_gpu_group_t group_id,
// rdc_policy_condition_type_t condition_type);
rpc DeletePolicy(DeletePolicyRequest) returns (DeletePolicyResponse) {}
// rdc_status_t rdc_policy_register(
// rdc_handle_t p_rdc_handle,
// rdc_gpu_group_t group_id,
// rdc_policy_register_callback callback);
rpc RegisterPolicy(RegisterPolicyRequest) returns (stream RegisterPolicyResponse) {}
// rdc_status_t rdc_policy_unregister(
// rdc_handle_t p_rdc_handle,
// rdc_gpu_group_t group_id);
rpc UnRegisterPolicy(UnRegisterPolicyRequest) returns (UnRegisterPolicyResponse) {}
}
message Empty {
@@ -485,4 +519,100 @@ message GetMixedComponentVersionRequest {
message GetMixedComponentVersionResponse {
uint32 status = 1;
string version = 2;
}
}
// Wire form of rdc_policy_condition_t.
message PolicyCondition{
// Numeric values match rdc_policy_condition_type_t in rdc.h.
enum Type {
COND_MAX_PAGE_RETRIED = 0;
COND_THERMAL = 1;
COND_POWER = 2;
};
Type type = 1;
// Threshold of the condition.
int64 value = 2;
}
// Wire form of rdc_policy_t: a condition and the action to take when it is met.
message Policy{
PolicyCondition condition = 1;
// Numeric values match rdc_policy_action_t in rdc.h.
enum Action {
ACTION_NONE = 0;
ACTION_GPU_RESET = 1;
};
Action action = 2;
}
// NOTE(review): not referenced by the policy RPCs of RdcAPI, which use
// SetPolicyResponse - confirm whether this message is still needed.
message SetPolicyResult {
uint32 status = 1;
}
// Request of RdcAPI.SetPolicy: the policy to set on a GPU group.
message SetPolicyRequest {
uint32 group_id = 1;
Policy policy =2;
}
// Reply of RdcAPI.SetPolicy.
message SetPolicyResponse {
// Presumably the rdc_status_t of the call - TODO confirm.
uint32 status = 1;
}
// The policies of a group as returned by RdcAPI.GetPolicy.
message PolicyResponse {
// Mirrors the `count` output of rdc_policy_get.
uint32 count= 1;
repeated Policy policies = 2;
}
// Request of RdcAPI.GetPolicy.
message GetPolicyRequest {
uint32 group_id = 1;
}
// Reply of RdcAPI.GetPolicy.
message GetPolicyResponse {
// Presumably the rdc_status_t of the call - TODO confirm.
uint32 status = 1;
PolicyResponse response = 2;
}
// Request of RdcAPI.DeletePolicy: identifies the policy by group and condition type.
message DeletePolicyRequest {
uint32 group_id = 1;
// Same numeric values as PolicyCondition.Type and rdc_policy_condition_type_t.
enum PolicyConditionType{
RDC_POLICY_COND_MAX_PAGE_RETRIED = 0;
RDC_POLICY_COND_THERMAL = 1;
RDC_POLICY_COND_POWER = 2;
};
PolicyConditionType condition_type = 2;
}
// Reply of RdcAPI.DeletePolicy.
message DeletePolicyResponse {
// Presumably the rdc_status_t of the call - TODO confirm.
uint32 status = 1;
}
// NOTE(review): not referenced by the policy RPCs of RdcAPI, which stream
// RegisterPolicyResponse - confirm whether this message is still needed.
message RegisterPolicyResult {
uint32 status = 1;
}
// Request of RdcAPI.RegisterPolicy: the group whose policy events are streamed back.
message RegisterPolicyRequest {
uint32 group_id = 1;
}
// One streamed policy event of RdcAPI.RegisterPolicy; carries the fields of
// rdc_policy_callback_response_t plus a status.
message RegisterPolicyResponse {
  uint32 status = 1;
  uint32 version = 2;
  PolicyCondition condition = 3;
  uint32 group_id = 4;
  // Signed, to agree with rdc_policy_callback_response_t.value (int64_t) and
  // PolicyCondition.value. int64 and uint64 are wire compatible, so this does
  // not change the encoding.
  int64 value = 5;
}
// NOTE(review): not referenced by the policy RPCs of RdcAPI, which use
// UnRegisterPolicyResponse - confirm whether this message is still needed.
message UnRegisterPolicyResult {
uint32 status = 1;
}
// Request of RdcAPI.UnRegisterPolicy.
message UnRegisterPolicyRequest {
uint32 group_id = 1;
}
// Reply of RdcAPI.UnRegisterPolicy.
message UnRegisterPolicyResponse {
// Presumably the rdc_status_t of the call - TODO confirm.
uint32 status = 1;
}
+46
View File
@@ -409,3 +409,49 @@ char* strncpy_with_null(char* dest, const char* src, size_t n) {
dest[n - 1] = '\0';
return dest;
}
// C API entry point: forwards to the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                            rdc_policy_t policy) {
  auto* const handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_set(group_id, policy);
}
// C API entry point: forwards to the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t* count,
                            rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) {
  auto* const handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_get(group_id, count, policies);
}
// C API entry point: forwards to the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                               rdc_policy_condition_type_t condition_type) {
  auto* const handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_delete(group_id, condition_type);
}
// C API entry point: forwards to the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                                 rdc_policy_register_callback callback) {
  auto* const handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_register(group_id, callback);
}
// C API entry point: forwards to the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) {
  auto* const handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_policy_unregister(group_id);
}
+1
View File
@@ -18,6 +18,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
"${SRC_DIR}/RdcModuleMgrImpl.cc"
"${SRC_DIR}/RdcNotificationImpl.cc"
"${SRC_DIR}/RdcPerfTimer.cc"
"${SRC_DIR}/RdcPolicyImpl.cc"
"${SRC_DIR}/RdcRocpLib.cc"
"${SRC_DIR}/RdcRocrLib.cc"
"${SRC_DIR}/RdcRVSLib.cc"
+31 -1
View File
@@ -34,6 +34,7 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h"
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
#include "rdc_lib/impl/RdcNotificationImpl.h"
#include "rdc_lib/impl/RdcPolicyImpl.h"
#include "rdc_lib/impl/RdcWatchTableImpl.h"
#include "rdc_lib/rdc_common.h"
@@ -78,7 +79,8 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode)
rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)),
rdc_notif_(new RdcNotificationImpl()),
watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, rdc_module_mgr_, rdc_notif_)),
metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) {
metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)),
policy_(new RdcPolicyImpl(group_settings_,metric_fetcher_)) {
if (mode == RDC_OPERATION_MODE_AUTO) {
RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO");
metrics_updater_->start();
@@ -429,5 +431,33 @@ rdc_status_t RdcEmbeddedHandler::get_mixed_component_version(mixed_component_t c
return RDC_ST_OK;
}
// Policy API
// Embedded mode: the policy engine lives in this process, so simply delegate.
rdc_status_t RdcEmbeddedHandler::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) {
  const rdc_status_t outcome = policy_->rdc_policy_set(group_id, policy);
  return outcome;
}
// Embedded mode: validate the output arguments, then delegate to the policy engine.
// Returns RDC_ST_BAD_PARAMETER when either output pointer is null, because the
// engine writes through both of them.
rdc_status_t RdcEmbeddedHandler::rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
                                                rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) {
  if (count == nullptr || policies == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  return policy_->rdc_policy_get(group_id, count, policies);
}
// Embedded mode: the policy engine lives in this process, so simply delegate.
rdc_status_t RdcEmbeddedHandler::rdc_policy_delete(rdc_gpu_group_t group_id,
                                                   rdc_policy_condition_type_t condition_type) {
  const rdc_status_t outcome = policy_->rdc_policy_delete(group_id, condition_type);
  return outcome;
}
// Embedded mode: the policy engine lives in this process, so simply delegate.
rdc_status_t RdcEmbeddedHandler::rdc_policy_register(rdc_gpu_group_t group_id,
                                                     rdc_policy_register_callback callback) {
  const rdc_status_t outcome = policy_->rdc_policy_register(group_id, callback);
  return outcome;
}
// Embedded mode: the policy engine lives in this process, so simply delegate.
rdc_status_t RdcEmbeddedHandler::rdc_policy_unregister(rdc_gpu_group_t group_id) {
  const rdc_status_t outcome = policy_->rdc_policy_unregister(group_id);
  return outcome;
}
} // namespace rdc
} // namespace amd
+9
View File
@@ -664,6 +664,15 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
}
break;
}
case RDC_FI_GPU_PAGE_RETRIED: {
  // Only the number of bad pages is reported. Passing nullptr as the record
  // buffer asks amdsmi for the count only; a single stack record would not be
  // large enough when the GPU has more than one bad page.
  uint32_t num_pages = 0;
  value->status = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, nullptr);
  value->type = INTEGER;
  if (value->status == AMDSMI_STATUS_SUCCESS) {
    value->value.l_int = num_pages;
  }
  break;
}
case RDC_FI_OAM_ID: {
amdsmi_asic_info_t asic_info;
value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
+258
View File
@@ -0,0 +1,258 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcPolicyImpl.h"
#include <sys/time.h>
#include <unistd.h>
#include <algorithm>
#include <ctime>
#include <map>
#include <sstream>
#include <unordered_map>
#include "common/rdc_utils.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/SmiUtils.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Stores the collaborators and starts the background thread that evaluates
// all policies every 500 ms until start_ is cleared by the destructor.
RdcPolicyImpl::RdcPolicyImpl(const RdcGroupSettingsPtr& group_settings,
                             const RdcMetricFetcherPtr& metric_fetcher)
    : group_settings_(group_settings), metric_fetcher_(metric_fetcher), start_(true) {
  thread_ = std::thread([this]() {
    while (start_) {
      rdc_policy_check_condition();
      // usleep() takes microseconds, so 500 ms is 500 * 1000. The previous
      // value of 500 polled the GPUs every 0.5 ms.
      usleep(500 * 1000);
    }
  });
}
// Stops the polling thread started by the constructor.
RdcPolicyImpl::~RdcPolicyImpl() {
// Ask the polling loop to exit; it checks the flag once per iteration.
start_ = false;
// Wait for the loop to finish its current iteration before members are destroyed.
thread_.join();
}
// Adds `policy` to the group, or overwrites the threshold and action of the
// group's existing policy with the same condition type.
//
// Returns:
//   RDC_ST_BAD_PARAMETER  - condition type or action is out of range.
//   RDC_ST_NOT_SUPPORTED  - a retired-page policy was requested but a GPU of
//                           the group cannot report retired pages.
//   status of the group lookup when a retired-page policy is requested for a
//   group whose information cannot be read.
//   RDC_ST_OK             - the policy was stored.
rdc_status_t RdcPolicyImpl::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) {
  std::lock_guard<std::mutex> guard(policy_mutex_);

  // parameters check; the casts also reject negative values, which would
  // otherwise be used as array indexes when the policies are evaluated
  const int type_index = static_cast<int>(policy.condition.type);
  if (type_index < 0 || type_index >= static_cast<int>(RDC_POLICY_COND_MAX)) {
    return RDC_ST_BAD_PARAMETER;
  }
  const int action_index = static_cast<int>(policy.action);
  if (action_index < 0 || action_index > static_cast<int>(RDC_POLICY_ACTION_GPU_RESET)) {
    return RDC_ST_BAD_PARAMETER;
  }

  // check if support RDC_POLICY_COND_MAX_PAGE_RETRIED
  if (RDC_POLICY_COND_MAX_PAGE_RETRIED == policy.condition.type) {
    rdc_group_info_t group_info;
    rdc_status_t status = group_settings_->rdc_group_gpu_get_info(group_id, &group_info);
    if (status != RDC_ST_OK) {
      // group_info is not valid; do not iterate over it
      return status;
    }
    for (unsigned int i = 0; i < group_info.count; i++) {
      const uint32_t gpu_index = group_info.entity_ids[i];
      rdc_field_value value;
      status = metric_fetcher_->fetch_smi_field(gpu_index, RDC_FI_GPU_PAGE_RETRIED, &value);
      if (status == RDC_ST_MSI_ERROR) return RDC_ST_NOT_SUPPORTED;
    }
  }

  // operator[] creates the (empty) policy list on the first policy of a group
  std::vector<rdc_policy_t>& policies = settings_[group_id];
  auto existing = std::find_if(policies.begin(), policies.end(), [&policy](const rdc_policy_t& p) {
    return p.condition.type == policy.condition.type;
  });
  if (existing != policies.end()) {
    // if exist, overwrite the value and action
    existing->condition.value = policy.condition.value;
    existing->action = policy.action;
  } else {
    policies.push_back(policy);
  }
  return RDC_ST_OK;
}
// Copies the policies of the group into `policies` and stores their number in
// `count`. At most RDC_MAX_POLICY_SETTINGS entries are written.
//
// Returns:
//   RDC_ST_BAD_PARAMETER - `count` or `policies` is null.
//   RDC_ST_NOT_FOUND     - no policy was ever set for the group.
//   RDC_ST_OK            - otherwise (count may be 0 after all policies were deleted).
rdc_status_t RdcPolicyImpl::rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
                                           rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) {
  if (count == nullptr || policies == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(policy_mutex_);

  auto it = settings_.find(group_id);
  if (it == settings_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  uint32_t copied = 0;
  for (const rdc_policy_t& stored : it->second) {
    // never write past the caller's array
    if (copied >= RDC_MAX_POLICY_SETTINGS) break;
    policies[copied] = stored;
    ++copied;
  }
  *count = copied;
  return RDC_ST_OK;
}
// Removes the group's policy with the given condition type.
// Returns RDC_ST_OK when a policy was removed and RDC_ST_NOT_FOUND when the
// group has no policies or none of the given type.
rdc_status_t RdcPolicyImpl::rdc_policy_delete(rdc_gpu_group_t group_id,
                                              rdc_policy_condition_type_t condition_type) {
  std::lock_guard<std::mutex> guard(policy_mutex_);

  auto group_entry = settings_.find(group_id);
  if (group_entry == settings_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  std::vector<rdc_policy_t>& group_policies = group_entry->second;
  const auto first_removed =
      std::remove_if(group_policies.begin(), group_policies.end(),
                     [condition_type](const rdc_policy_t& candidate) {
                       return candidate.condition.type == condition_type;
                     });
  if (first_removed == group_policies.end()) {
    // nothing matched
    return RDC_ST_NOT_FOUND;
  }

  group_policies.erase(first_removed, group_policies.end());
  return RDC_ST_OK;
}
/**
 * @brief Install (or replace) the notification callback of a GPU group.
 *
 * @return Always RDC_ST_OK: an existing callback for the group is
 *         overwritten, otherwise a new entry is added.
 */
rdc_status_t RdcPolicyImpl::rdc_policy_register(rdc_gpu_group_t group_id,
                                                rdc_policy_register_callback callback) {
  std::lock_guard<std::mutex> guard(policy_mutex_);

  const auto existing = callbacks_.find(group_id);
  if (existing == callbacks_.end()) {
    callbacks_.insert(std::make_pair(group_id, callback));
  } else {
    existing->second = callback;
  }
  return RDC_ST_OK;
}
/**
 * @brief Drop the notification callback of a GPU group, if one is stored.
 *
 * @return Always RDC_ST_OK, whether or not a callback was registered.
 */
rdc_status_t RdcPolicyImpl::rdc_policy_unregister(rdc_gpu_group_t group_id) {
  std::lock_guard<std::mutex> guard(policy_mutex_);
  callbacks_.erase(group_id);
  return RDC_ST_OK;
}
void RdcPolicyImpl::rdc_policy_check_condition() {
// go through the settings
for (auto it : settings_) {
rdc_gpu_group_t group_id = it.first;
std::vector<rdc_policy_t> policies = it.second;
rdc_policy_register_callback callback = rdc_policy_get_callback(group_id);
for (auto policy : policies) {
rdc_status_t status;
rdc_field_value value;
rdc_group_info_t group_info;
uint32_t gpu_index;
status = group_settings_->rdc_group_gpu_get_info(group_id, &group_info);
for (unsigned int i = 0; i < group_info.count; i++) {
rdc_field_t map[RDC_POLICY_COND_MAX] = {RDC_FI_GPU_PAGE_RETRIED, RDC_FI_GPU_TEMP,
RDC_FI_POWER_USAGE};
gpu_index = group_info.entity_ids[i];
status = metric_fetcher_->fetch_smi_field(gpu_index, map[policy.condition.type], &value);
if (status == RDC_ST_OK) {
if (value.value.l_int > policy.condition.value) {
if (RDC_POLICY_ACTION_GPU_RESET == policy.action) {
rdc_policy_gpu_reset(gpu_index);
}
// callback if needed
if (callback) {
rdc_policy_callback_response_t response = {1, policy.condition, group_id,
value.value.l_int};
callback(&response);
}
}
}
}
}
}
}
/**
 * @brief Look up the callback stored for a GPU group.
 *
 * @return The callback found in callbacks_, or nullptr when the group has
 *         no entry. This function does not take policy_mutex_ itself.
 */
rdc_policy_register_callback RdcPolicyImpl::rdc_policy_get_callback(rdc_gpu_group_t group_id) {
  const auto found = callbacks_.find(group_id);
  if (found == callbacks_.end()) {
    return nullptr;
  }
  return found->second;
}
void RdcPolicyImpl::rdc_policy_gpu_reset(uint32_t gpu_index) {
amdsmi_processor_handle processor_handle = {};
amdsmi_status_t err = get_processor_handle_from_id(gpu_index, &processor_handle);
if (err == AMDSMI_STATUS_SUCCESS) {
amdsmi_reset_gpu(processor_handle);
}
}
} // namespace rdc
} // namespace amd
+152 -4
View File
@@ -23,6 +23,8 @@ THE SOFTWARE.
#include <grpcpp/grpcpp.h>
#include <future>
#include "rdc.grpc.pb.h" // NOLINT
amd::rdc::RdcHandler* make_handler(const char* ip_and_port, const char* root_ca,
@@ -243,8 +245,8 @@ rdc_status_t RdcStandaloneHandler::rdc_device_get_attributes(uint32_t gpu_index,
return RDC_ST_OK;
}
rdc_status_t RdcStandaloneHandler::rdc_device_get_component_version(rdc_component_t component, rdc_component_version_t* p_rdc_compv) {
rdc_status_t RdcStandaloneHandler::rdc_device_get_component_version(
rdc_component_t component, rdc_component_version_t* p_rdc_compv) {
if (!p_rdc_compv) {
return RDC_ST_BAD_PARAMETER;
}
@@ -663,8 +665,8 @@ rdc_status_t RdcStandaloneHandler::rdc_field_update_all(uint32_t wait_for_update
}
// It is only an interface for the client under the GRPC framework and is not used as an RDC API.
rdc_status_t RdcStandaloneHandler::get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) {
rdc_status_t RdcStandaloneHandler::get_mixed_component_version(
mixed_component_t component, mixed_component_version_t* p_mixed_compv) {
if (!p_mixed_compv) {
return RDC_ST_BAD_PARAMETER;
}
@@ -681,7 +683,153 @@ rdc_status_t RdcStandaloneHandler::get_mixed_component_version(mixed_component_t
strncpy_with_null(p_mixed_compv->version, reply.version().c_str(), USR_MAX_VERSION_STR_LENGTH);
return RDC_ST_OK;
}
// Policy RdcAPI
/**
 * @brief Send a policy for a GPU group to rdcd through the SetPolicy RPC.
 *
 * @return The status derived by error_handle() from the gRPC status and
 *         the status field of the reply.
 */
rdc_status_t RdcStandaloneHandler::rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) {
  ::rdc::SetPolicyRequest request;
  ::rdc::SetPolicyResponse reply;
  ::grpc::ClientContext context;

  // Translate the C policy structure into its protobuf counterpart.
  request.set_group_id(group_id);
  ::rdc::Policy* wire_policy = request.mutable_policy();
  wire_policy->set_action(static_cast<::rdc::Policy_Action>(policy.action));
  ::rdc::PolicyCondition* wire_condition = wire_policy->mutable_condition();
  wire_condition->set_type(static_cast<::rdc::PolicyCondition_Type>(policy.condition.type));
  wire_condition->set_value(policy.condition.value);

  // Issue the RPC and map both transport and application status.
  ::grpc::Status rpc_status = stub_->SetPolicy(&context, request, &reply);
  return error_handle(rpc_status, reply.status());
}
/**
 * @brief Fetch the policies of a GPU group from rdcd through GetPolicy.
 *
 * @param group_id  Group whose policies are requested.
 * @param count     Out: number of entries written to @p policies.
 * @param policies  Out: array with room for RDC_MAX_POLICY_SETTINGS entries.
 *
 * @return RDC_ST_BAD_PARAMETER when an output pointer is null, the status
 *         produced by error_handle() when the RPC or the server reports a
 *         failure, RDC_ST_OK otherwise.
 */
rdc_status_t RdcStandaloneHandler::rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
                                                  rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) {
  ::rdc::GetPolicyRequest request;
  ::rdc::GetPolicyResponse reply;
  ::grpc::ClientContext context;

  if (count == nullptr || policies == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  request.set_group_id(group_id);

  // call gRPC
  ::grpc::Status status = stub_->GetPolicy(&context, request, &reply);
  rdc_status_t err_status = error_handle(status, reply.status());
  if (err_status != RDC_ST_OK) return err_status;

  const auto& response = reply.response();

  // The count field comes from the wire: never trust it beyond the number
  // of policies actually present in the message or beyond the capacity of
  // the caller's array.
  uint32_t policy_count = response.count();
  const uint32_t available = static_cast<uint32_t>(response.policies_size());
  if (policy_count > available) {
    policy_count = available;
  }
  if (policy_count > RDC_MAX_POLICY_SETTINGS) {
    policy_count = RDC_MAX_POLICY_SETTINGS;
  }

  for (uint32_t i = 0; i < policy_count; ++i) {
    const ::rdc::Policy& policy = response.policies(static_cast<int>(i));
    const ::rdc::PolicyCondition& cond = policy.condition();
    policies[i].condition.type = static_cast<rdc_policy_condition_type_t>(cond.type());
    policies[i].condition.value = cond.value();
    policies[i].action = static_cast<rdc_policy_action_t>(policy.action());
  }

  *count = policy_count;
  return RDC_ST_OK;
}
/**
 * @brief Ask rdcd, through the DeletePolicy RPC, to remove the policy of
 *        the given condition type from a GPU group.
 *
 * @return The status derived by error_handle() from the gRPC status and
 *         the status field of the reply.
 */
rdc_status_t RdcStandaloneHandler::rdc_policy_delete(rdc_gpu_group_t group_id,
                                                     rdc_policy_condition_type_t condition_type) {
  ::rdc::DeletePolicyRequest request;
  ::rdc::DeletePolicyResponse reply;
  ::grpc::ClientContext context;

  const auto wire_type =
      static_cast<::rdc::DeletePolicyRequest_PolicyConditionType>(condition_type);
  request.set_group_id(group_id);
  request.set_condition_type(wire_type);

  ::grpc::Status rpc_status = stub_->DeletePolicy(&context, request, &reply);
  return error_handle(rpc_status, reply.status());
}
/**
 * @brief Register a policy callback for a GPU group.
 *
 * Starts a worker thread that opens the RegisterPolicy server stream and
 * invokes @p callback for every message received from rdcd.
 *
 * @return RDC_ST_BAD_PARAMETER when @p callback is null, RDC_ST_CONFLICT
 *         when the group already has an entry in policy_threads_,
 *         RDC_ST_OK once the worker has been started.
 */
rdc_status_t RdcStandaloneHandler::rdc_policy_register(rdc_gpu_group_t group_id,
                                                       rdc_policy_register_callback callback) {
  // The worker calls the callback unconditionally; reject null up front.
  if (callback == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // check if a thread for a group is already registered
  if (policy_threads_.find(group_id) != policy_threads_.end()) {
    return RDC_ST_CONFLICT;
  }

  // Publish the context BEFORE starting the worker. The worker looks its
  // own entry up in policy_threads_; if the entry were inserted after the
  // thread started, the worker could miss it and stop after a single read.
  struct policy_thread_context ctx = {true, nullptr};
  auto inserted = policy_threads_.insert(std::make_pair(group_id, ctx));

  // NOTE(review): policy_threads_ is still accessed from several threads
  // without a lock; a dedicated mutex would be needed to make concurrent
  // register calls for different groups fully safe.
  inserted.first->second.t = new std::thread([this, group_id, callback]() {
    ::rdc::RegisterPolicyRequest request;
    ::rdc::RegisterPolicyResponse reply;
    ::grpc::ClientContext context;

    request.set_group_id(group_id);

    // Open the server stream.
    std::unique_ptr<grpc::ClientReader<::rdc::RegisterPolicyResponse>> reader(
        stub_->RegisterPolicy(&context, request));

    while (true) {
      // Stop as soon as the registration is gone or was asked to stop.
      auto it = policy_threads_.find(group_id);
      if (it == policy_threads_.end() || !it->second.start) {
        break;
      }

      // Read() returns false once the stream is finished or broken. Leave
      // the loop in that case: retrying would spin forever on a dead stream.
      if (!reader->Read(&reply)) {
        break;
      }

      const ::rdc::PolicyCondition& cond = reply.condition();
      rdc_policy_callback_response_t response;
      response.version = reply.version();
      response.condition.type = static_cast<rdc_policy_condition_type_t>(cond.type());
      response.condition.value = cond.value();
      response.group_id = reply.group_id();
      response.value = reply.value();

      callback(&response);
    }
    reader->Finish();
  });

  return RDC_ST_OK;
}
/**
 * @brief Unregister the policy callback of a GPU group.
 *
 * Clears the start flag of the group's reader thread context (when one
 * exists) and forwards the request to rdcd through UnRegisterPolicy.
 *
 * NOTE(review): the entry stays in policy_threads_ and the thread object is
 * not released here, so a later rdc_policy_register() for the same group
 * reports RDC_ST_CONFLICT - confirm this is intended.
 *
 * @return The status derived by error_handle() from the gRPC status and
 *         the status field of the reply.
 */
rdc_status_t RdcStandaloneHandler::rdc_policy_unregister(rdc_gpu_group_t group_id) {
  ::rdc::UnRegisterPolicyRequest request;
  ::rdc::UnRegisterPolicyResponse reply;
  ::grpc::ClientContext context;

  // Ask the thread associated with this group to stop.
  const auto worker = policy_threads_.find(group_id);
  if (worker != policy_threads_.end()) {
    worker->second.start = false;
  }

  // Build the request and notify the server.
  request.set_group_id(group_id);
  ::grpc::Status rpc_status = stub_->UnRegisterPolicy(&context, request, &reply);
  return error_handle(rpc_status, reply.status());
}
} // namespace rdc
+38 -4
View File
@@ -22,6 +22,8 @@ THE SOFTWARE.
#ifndef SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_
#define SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_
#include <thread>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc.h"
@@ -34,6 +36,7 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
~RdcAPIServiceImpl();
rdc_status_t Initialize(uint64_t rdcd_init_flags = 0);
void Shutdown();
::grpc::Status GetAllDevices(::grpc::ServerContext* context, const ::rdc::Empty* request,
::rdc::GetAllDevicesResponse* reply) override;
@@ -42,8 +45,9 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
const ::rdc::GetDeviceAttributesRequest* request,
::rdc::GetDeviceAttributesResponse* reply) override;
::grpc::Status GetComponentVersion(::grpc::ServerContext* context, const ::rdc::GetComponentVersionRequest* request,
::rdc::GetComponentVersionResponse* reply) override;
::grpc::Status GetComponentVersion(::grpc::ServerContext* context,
const ::rdc::GetComponentVersionRequest* request,
::rdc::GetComponentVersionResponse* reply) override;
::grpc::Status CreateGpuGroup(::grpc::ServerContext* context,
const ::rdc::CreateGpuGroupRequest* request,
@@ -125,11 +129,41 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
const ::rdc::DiagnosticTestCaseRunRequest* request,
::rdc::DiagnosticTestCaseRunResponse* reply) override;
::grpc::Status GetMixedComponentVersion(::grpc::ServerContext* context, const ::rdc::GetMixedComponentVersionRequest* request,
::rdc::GetMixedComponentVersionResponse* reply) override;
::grpc::Status GetMixedComponentVersion(::grpc::ServerContext* context,
const ::rdc::GetMixedComponentVersionRequest* request,
::rdc::GetMixedComponentVersionResponse* reply) override;
// ---- Policy RPC handlers ----

// Converts the policy carried by the request into an rdc_policy_t, applies
// it with rdc_policy_set() and reports the result in the reply status.
::grpc::Status SetPolicy(::grpc::ServerContext* context, const ::rdc::SetPolicyRequest* request,
::rdc::SetPolicyResponse* reply) override;
// Reads the policies of the requested group with rdc_policy_get() and
// copies them, together with their count, into the reply.
::grpc::Status GetPolicy(::grpc::ServerContext* context, const ::rdc::GetPolicyRequest* request,
::rdc::GetPolicyResponse* reply) override;
// Removes the policy of the requested condition type from the group with
// rdc_policy_delete() and reports the result in the reply status.
::grpc::Status DeletePolicy(::grpc::ServerContext* context,
const ::rdc::DeletePolicyRequest* request,
::rdc::DeletePolicyResponse* reply) override;
// Server-streaming RPC: registers PolicyCallback for the group and writes a
// RegisterPolicyResponse to the stream when the callback signals an event.
::grpc::Status RegisterPolicy(
::grpc::ServerContext* context, const ::rdc::RegisterPolicyRequest* request,
::grpc::ServerWriter< ::rdc::RegisterPolicyResponse>* stream) override;
// Calls rdc_policy_unregister() for the group and, on success, clears the
// start flag of the group's streaming context.
::grpc::Status UnRegisterPolicy(::grpc::ServerContext* context,
const ::rdc::UnRegisterPolicyRequest* request,
::rdc::UnRegisterPolicyResponse* reply) override;
private:
bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target);
rdc_handle_t rdc_handle_;
// Per-group state shared between PolicyCallback (producer) and the
// RegisterPolicy streaming loop (consumer).
struct policy_thread_context {
// Locked around accesses to `response` and while waiting on `cond`.
pthread_mutex_t mutex;
// Signalled by PolicyCallback after it stored a new `response`.
pthread_cond_t cond;
// Latest event copied in by PolicyCallback; sent to the client by the
// streaming loop.
rdc_policy_callback_response_t response;
// Streaming loop runs while true; set to false by Shutdown() and
// UnRegisterPolicy(). Read and written without holding `mutex`.
bool start;
};
// Maps a group id to the heap-allocated streaming context of that group.
// Static so that the static PolicyCallback below can reach it.
static std::map<uint32_t, struct policy_thread_context*> policy_threads_;
// Callback handed to rdc_policy_register(): stores *userData in the context
// of userData->group_id and signals its condition variable. Returns 1 when
// userData is null, 0 otherwise.
static int PolicyCallback(rdc_policy_callback_response_t* userData);
};
} // namespace rdc
+196 -8
View File
@@ -20,11 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc/rdc_api_service.h"
#include "rdc/rdc_server_main.h"
#include <assert.h>
#include <grpcpp/grpcpp.h>
#include <csignal>
#include <future>
#include <iostream>
#include <memory>
#include <string>
@@ -32,12 +33,15 @@ THE SOFTWARE.
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc.h"
#include "rdc/rdc_private.h"
#include "rdc/rdc_server_main.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Out-of-class definition of the static map from group id to streaming
// context; entries are added by RegisterPolicy().
std::map<uint32_t, RdcAPIServiceImpl::policy_thread_context*> RdcAPIServiceImpl::policy_threads_;
// Constructs the service with no RDC handle yet (rdc_handle_ is nullptr).
RdcAPIServiceImpl::RdcAPIServiceImpl() : rdc_handle_(nullptr) {}
rdc_status_t RdcAPIServiceImpl::Initialize(uint64_t rdcd_init_flags) {
@@ -55,6 +59,14 @@ rdc_status_t RdcAPIServiceImpl::Initialize(uint64_t rdcd_init_flags) {
return result;
}
/**
 * @brief Ask every policy streaming loop to exit by clearing the start
 *        flag of each registered context.
 */
void RdcAPIServiceImpl::Shutdown() {
  for (auto entry = policy_threads_.begin(); entry != policy_threads_.end(); ++entry) {
    entry->second->start = false;
  }
}
RdcAPIServiceImpl::~RdcAPIServiceImpl() {
if (rdc_handle_) {
rdc_stop_embedded(rdc_handle_);
@@ -104,9 +116,9 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() {
return ::grpc::Status::OK;
}
::grpc::Status RdcAPIServiceImpl::GetComponentVersion(::grpc::ServerContext* context,
const ::rdc::GetComponentVersionRequest* request,
::rdc::GetComponentVersionResponse* reply) {
::grpc::Status RdcAPIServiceImpl::GetComponentVersion(
::grpc::ServerContext* context, const ::rdc::GetComponentVersionRequest* request,
::rdc::GetComponentVersionResponse* reply) {
(void)(context);
if (!reply) {
return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty reply");
@@ -661,9 +673,9 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
return ::grpc::Status::OK;
}
::grpc::Status RdcAPIServiceImpl::GetMixedComponentVersion(::grpc::ServerContext* context,
const ::rdc::GetMixedComponentVersionRequest* request,
::rdc::GetMixedComponentVersionResponse* reply) {
::grpc::Status RdcAPIServiceImpl::GetMixedComponentVersion(
::grpc::ServerContext* context, const ::rdc::GetMixedComponentVersionRequest* request,
::rdc::GetMixedComponentVersionResponse* reply) {
(void)(context);
if (!reply) {
return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty reply");
@@ -687,9 +699,185 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
reply->set_status(RDC_ST_OK);
return ::grpc::Status::OK;
} else {
return ::grpc::Status(::grpc::StatusCode::INVALID_ARGUMENT, "The provided request parameters are invalid");
return ::grpc::Status(::grpc::StatusCode::INVALID_ARGUMENT,
"The provided request parameters are invalid");
}
}
/**
 * @brief gRPC handler that applies the policy carried by the request.
 *
 * The outcome of rdc_policy_set() is returned in the reply status; the
 * gRPC status is INTERNAL only when request or reply is null.
 */
::grpc::Status RdcAPIServiceImpl::SetPolicy(::grpc::ServerContext* context,
                                            const ::rdc::SetPolicyRequest* request,
                                            ::rdc::SetPolicyResponse* reply) {
  (void)(context);
  if (reply == nullptr || request == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  // Build the C policy structure from the protobuf message.
  const ::rdc::Policy& wire_policy = request->policy();
  const ::rdc::PolicyCondition& wire_condition = wire_policy.condition();

  rdc_policy_t policy;
  policy.condition.type = static_cast<rdc_policy_condition_type_t>(wire_condition.type());
  policy.condition.value = wire_condition.value();
  policy.action = static_cast<rdc_policy_action_t>(wire_policy.action());

  // Apply it and hand the API status back to the client.
  const rdc_status_t api_status = rdc_policy_set(rdc_handle_, request->group_id(), policy);
  reply->set_status(api_status);

  return ::grpc::Status::OK;
}
/**
 * @brief gRPC handler that returns the policies of a GPU group.
 *
 * The outcome of rdc_policy_get() is always stored in the reply status;
 * the policy list and its count are filled in only on RDC_ST_OK.
 */
::grpc::Status RdcAPIServiceImpl::GetPolicy(::grpc::ServerContext* context,
                                            const ::rdc::GetPolicyRequest* request,
                                            ::rdc::GetPolicyResponse* reply) {
  (void)(context);
  if (reply == nullptr || request == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  rdc_policy_t stored[RDC_MAX_POLICY_SETTINGS];
  uint32_t stored_count = 0;

  const rdc_status_t api_status =
      rdc_policy_get(rdc_handle_, request->group_id(), &stored_count, stored);
  reply->set_status(api_status);

  if (api_status == RDC_ST_OK) {
    ::rdc::PolicyResponse* payload = reply->mutable_response();
    payload->set_count(stored_count);

    // Mirror each C policy into a protobuf Policy message.
    for (uint32_t idx = 0; idx < stored_count; ++idx) {
      ::rdc::Policy* wire_policy = payload->add_policies();
      wire_policy->set_action(static_cast<::rdc::Policy_Action>(stored[idx].action));

      ::rdc::PolicyCondition* wire_condition = wire_policy->mutable_condition();
      wire_condition->set_type(
          static_cast<::rdc::PolicyCondition_Type>(stored[idx].condition.type));
      wire_condition->set_value(stored[idx].condition.value);
    }
  }

  return ::grpc::Status::OK;
}
/**
 * @brief gRPC handler that deletes the policy of a condition type from a
 *        GPU group and reports the rdc_policy_delete() result in the reply.
 */
::grpc::Status RdcAPIServiceImpl::DeletePolicy(::grpc::ServerContext* context,
                                               const ::rdc::DeletePolicyRequest* request,
                                               ::rdc::DeletePolicyResponse* reply) {
  (void)(context);
  if (reply == nullptr || request == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const auto condition_type =
      static_cast<rdc_policy_condition_type_t>(request->condition_type());
  const rdc_status_t api_status =
      rdc_policy_delete(rdc_handle_, request->group_id(), condition_type);

  reply->set_status(api_status);
  return ::grpc::Status::OK;
}
/**
 * @brief Callback given to rdc_policy_register(): hands a policy event to
 *        the streaming loop of the event's group.
 *
 * @return 1 when @p userData is null, 0 otherwise (also when no context is
 *         registered for the group).
 */
int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData) {
  if (!userData) {
    std::cerr << "The rdc_policy_callback returns null data\n";
    return 1;
  }

  const auto entry = policy_threads_.find(userData->group_id);
  if (entry == policy_threads_.end()) {
    return 0;
  }

  // Store the event under the context lock, then wake the waiting loop.
  policy_thread_context* const target = entry->second;
  pthread_mutex_lock(&target->mutex);
  target->response = *userData;
  pthread_cond_signal(&target->cond);
  pthread_mutex_unlock(&target->mutex);
  return 0;
}
/**
 * @brief Server-streaming gRPC handler that forwards policy events of a GPU
 *        group to the client.
 *
 * Registers PolicyCallback for the group, then waits (with a one second
 * timeout per iteration) for the callback to signal an event and writes it
 * to the stream. The loop ends when the group's start flag is cleared
 * (UnRegisterPolicy / Shutdown), when the RPC is cancelled, or when a write
 * to the client fails.
 *
 * @return INTERNAL when request or writer is null, OK otherwise (also when
 *         rdc_policy_register() fails, in which case nothing is streamed).
 */
::grpc::Status RdcAPIServiceImpl::RegisterPolicy(
    ::grpc::ServerContext* context, const ::rdc::RegisterPolicyRequest* request,
    ::grpc::ServerWriter<::rdc::RegisterPolicyResponse>* writer) {
  if (!writer || !request) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const auto group_id = request->group_id();

  // Reuse the context of a group that was registered before instead of
  // allocating a second one: insert() would refuse the duplicate key and
  // the fresh allocation would leak. Re-arming the flag also lets a group
  // be registered again after UnRegisterPolicy cleared it.
  policy_thread_context* ctx = nullptr;
  const auto existing = policy_threads_.find(group_id);
  if (existing != policy_threads_.end()) {
    ctx = existing->second;
    ctx->start = true;
  } else {
    // The context is owned by policy_threads_ and intentionally lives as
    // long as the (static) map, because PolicyCallback may use it anytime.
    ctx = new policy_thread_context();
    // Dynamically allocated pthread objects must be set up with the init
    // functions; the *_INITIALIZER macros are meant for static objects.
    pthread_mutex_init(&ctx->mutex, nullptr);
    pthread_cond_init(&ctx->cond, nullptr);
    ctx->start = true;
    policy_threads_.insert(std::make_pair(group_id, ctx));
  }

  // The loop runs on the handler thread itself: it must finish before the
  // handler returns (a future from std::async would block in its destructor
  // anyway), so a separate task brings nothing.
  const rdc_status_t result = rdc_policy_register(rdc_handle_, group_id, PolicyCallback);
  if (result != RDC_ST_OK) {
    return ::grpc::Status::OK;
  }

  while (ctx->start) {
    // Stop streaming once the client went away or cancelled the call.
    if (context != nullptr && context->IsCancelled()) {
      break;
    }

    struct timespec deadline;
    clock_gettime(CLOCK_REALTIME, &deadline);
    deadline.tv_sec += 1;  // wake up periodically to re-check the flags

    bool client_alive = true;
    pthread_mutex_lock(&ctx->mutex);
    const int rc = pthread_cond_timedwait(&ctx->cond, &ctx->mutex, &deadline);
    if (rc == 0) {
      // Signalled by PolicyCallback: forward the stored event.
      // NOTE(review): a spurious wakeup also returns 0 and would resend the
      // last event; a "pending" flag in the context would be needed to
      // tell the two apart.
      ::rdc::RegisterPolicyResponse reply;
      reply.set_status(RDC_ST_OK);
      reply.set_version(ctx->response.version);
      reply.set_group_id(ctx->response.group_id);
      reply.set_value(ctx->response.value);

      ::rdc::PolicyCondition* cond = reply.mutable_condition();
      cond->set_type(static_cast<::rdc::PolicyCondition_Type>(ctx->response.condition.type));
      cond->set_value(ctx->response.condition.value);

      // Write() returns false when the stream is closed.
      client_alive = writer->Write(reply);
    }
    // rc == ETIMEDOUT (or any error): nothing to send this round.
    pthread_mutex_unlock(&ctx->mutex);

    if (!client_alive) {
      break;
    }
  }

  return ::grpc::Status::OK;
}
/**
 * @brief gRPC handler that unregisters the policy callback of a GPU group.
 *
 * Calls rdc_policy_unregister(), reports its result in the reply status
 * and, on success, clears the start flag of the group's streaming context
 * so that the RegisterPolicy loop for that group exits.
 */
::grpc::Status RdcAPIServiceImpl::UnRegisterPolicy(::grpc::ServerContext* context,
                                                   const ::rdc::UnRegisterPolicyRequest* request,
                                                   ::rdc::UnRegisterPolicyResponse* reply) {
  (void)(context);
  if (!reply || !request) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const rdc_status_t result = rdc_policy_unregister(rdc_handle_, request->group_id());

  // Report the API result to the client, like every other handler does;
  // without it the client always reads the default status value.
  reply->set_status(result);

  if (result == RDC_ST_OK) {
    const auto it = policy_threads_.find(request->group_id());
    if (it != policy_threads_.end()) {
      it->second->start = false;
    }
  }

  return ::grpc::Status::OK;
}
} // namespace rdc
} // namespace amd
+2
View File
@@ -271,6 +271,8 @@ static int FileOwner(const char* fn, std::string* owner) {
}
void RDCServer::ShutDown(void) {
api_service_->Shutdown();
server_->Shutdown();
if (rdc_admin_service_) {