Implementation for SWDEV-479728:[RDC] - Clock Speed/Power Cap Control

Change-Id: I767a71325527aa3c691e9607953ceafebacfb4d5
Signed-off-by: adapryor <Adam.pryor@amd.com>
Этот коммит содержится в:
Adam Pryor
2024-10-31 09:48:09 -05:00
коммит произвёл Pryor, Adam
родитель 7c91a07a43
Коммит df170c8801
20 изменённых файлов: 1549 добавлений и 117 удалений
+6
Просмотреть файл
@@ -126,6 +126,12 @@ set(HEALTH_EXAMPLE_EXE "health")
add_executable(${HEALTH_EXAMPLE_EXE} "${HEALTH_EXAMPLE_SRC_LIST}")
target_link_libraries(${HEALTH_EXAMPLE_EXE} pthread dl rdc_bootstrap)
# "config" example executable, built from config_example.cc and linked against
# the same libraries as the "health" example above.
set(CONFIG_EXAMPLE_SRC_LIST "config_example.cc")
cmake_print_variables(CONFIG_EXAMPLE_SRC_LIST)
set(CONFIG_EXAMPLE_EXE "config")
add_executable(${CONFIG_EXAMPLE_EXE} "${CONFIG_EXAMPLE_SRC_LIST}")
target_link_libraries(${CONFIG_EXAMPLE_EXE} pthread dl rdc_bootstrap)
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Finished Cmake Example ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
+156
Просмотреть файл
@@ -0,0 +1,156 @@
#include <unistd.h>
#include <chrono>
#include <iostream>
#include <thread>
#include "rdc/rdc.h"
int main() {
rdc_gpu_group_t group_id;
rdc_status_t result;
bool standalone = false;
rdc_handle_t rdc_handle;
uint32_t count = 0;
rdc_config_setting_list_t settings_list;
rdc_config_setting_t setting;
uint64_t watts;
char hostIpAddress[] = {"localhost:50051"};
char group_name[] = {"group1"};
// Select the embedded mode and standalone mode dynamically.
std::cout << "Start rdci in: \n";
std::cout << "0 - Embedded mode \n";
std::cout << "1 - Standalone mode \n";
while (!(std::cin >> standalone)) {
std::cout << "Invalid input.\n";
std::cin.clear();
std::cin.ignore();
}
std::cout << std::endl;
std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n");
// Init the rdc
result = rdc_init(0);
if (result != RDC_ST_OK) {
std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
} else {
std::cout << "RDC Initialized.\n";
}
if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
if (result != RDC_ST_OK) {
std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
} else { // embedded
result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
if (result != RDC_ST_OK) {
std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
}
// Now we can use the same API for both standalone and embedded
// Get the list of devices in the system
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
result = rdc_device_get_all(rdc_handle, gpu_index_list, &count);
if (result != RDC_ST_OK) {
std::cout << "Error to find devices on the system. Return: " << rdc_status_string(result);
goto cleanup;
}
if (count == 0) {
std::cout << "No GPUs find on the sytem ";
goto cleanup;
} else {
std::cout << count << " GPUs found in the system.\n";
}
// Create the group
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
if (result != RDC_ST_OK) {
std::cout << "Error creating group. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Created the GPU group " << group_id << std::endl;
// Add all GPUs to the group
for (uint32_t i = 0; i < count; i++) {
result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]); // Add GPU 0
if (result != RDC_ST_OK) {
std::cout << "Error adding group. Return: " << rdc_status_string(result);
goto cleanup;
}
rdc_device_attributes_t attribute;
result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute);
if (result != RDC_ST_OK) {
std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Add GPU " << gpu_index_list[i] << ":" << attribute.device_name << " to group "
<< group_id << std::endl;
}
setting.type = RDC_CFG_POWER_LIMIT;
// Our targeted value is 195 Watts, which will be converted into Microwatts inside of
// rdc_config_set
setting.target_value = 195;
result = rdc_config_set(rdc_handle, group_id, setting);
if (result != RDC_ST_OK) {
std::cout << "Error set config RDC_CFG_POWER_LIMIT, Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
result = rdc_config_get(rdc_handle, group_id, &settings_list);
if (result != RDC_ST_OK) {
std::cout << "Error get config, Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
}
// Prompt user to change amd-smi to other value, and watch rdc config change it back
std::cout << "Config before wait:" << std::endl;
result = rdc_config_get(rdc_handle, group_id, &settings_list);
if (result != RDC_ST_OK) {
std::cout << "Error get config, Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
}
std::cout << "The config will keep the power limit to 195 Watts" << std::endl;
std::cout << "You can change the power limit using amd-smi, the RDC config module should be able "
"to detect it and set it back"
<< std::endl;
std::cout << "Waiting 3 minutes before exit ..." << std::endl;
std::this_thread::sleep_for(std::chrono::minutes(3));
result = rdc_config_clear(rdc_handle, group_id);
if (result != RDC_ST_OK) {
std::cout << "Error clear config, Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
}
//... clean up
cleanup:
std::cout << "Cleaning up.\n";
result = rdc_group_gpu_destroy(rdc_handle, group_id);
if (result != RDC_ST_OK) {
std::cout << "Error delete GPU group. Return: " << rdc_status_string(result);
}
std::cout << "Deleted the GPU group " << group_id << std::endl;
if (standalone)
rdc_disconnect(rdc_handle);
else
rdc_stop_embedded(rdc_handle);
rdc_shutdown();
return result;
}
+119 -45
Просмотреть файл
@@ -147,6 +147,11 @@ typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t;
*/
#define RDC_MAX_VERSION_STR_LENGTH 60
/**
 * @brief Maximum number of configuration settings that can be returned by one
 * configuration get (capacity of rdc_config_setting_list_t::settings)
 */
#define RDC_MAX_CONFIG_SETTINGS 32
/**
* These enums are used to specify a particular field to be retrieved.
*/
@@ -182,11 +187,11 @@ typedef enum {
/**
* @brief GPU usage related fields
*/
RDC_FI_GPU_UTIL = 500, //!< GPU Utilization
RDC_FI_GPU_MEMORY_USAGE, //!< Memory usage of the GPU instance
RDC_FI_GPU_MEMORY_TOTAL, //!< Total memory of the GPU instance
RDC_FI_GPU_MM_ENC_UTIL, //!< Multimedia encoder busy percentage
RDC_FI_GPU_MM_DEC_UTIL, //!< Multimedia decoder busy percentage
RDC_FI_GPU_UTIL = 500, //!< GPU Utilization
RDC_FI_GPU_MEMORY_USAGE, //!< Memory usage of the GPU instance
RDC_FI_GPU_MEMORY_TOTAL, //!< Total memory of the GPU instance
RDC_FI_GPU_MM_ENC_UTIL, //!< Multimedia encoder busy percentage
RDC_FI_GPU_MM_DEC_UTIL, //!< Multimedia decoder busy percentage
RDC_FI_GPU_MEMORY_ACTIVITY, //!< Memory busy percentage
/**
@@ -251,16 +256,16 @@ typedef enum {
RDC_FI_XGMI_6_READ_KB, //!< XGMI_6 accumulated data read size (KB)
RDC_FI_XGMI_7_READ_KB, //!< XGMI_7 accumulated data read size (KB)
RDC_FI_XGMI_0_WRITE_KB, //!< XGMI_0 accumulated data write size (KB)
RDC_FI_XGMI_1_WRITE_KB, //!< XGMI_1 accumulated data write size (KB)
RDC_FI_XGMI_2_WRITE_KB, //!< XGMI_2 accumulated data write size (KB)
RDC_FI_XGMI_3_WRITE_KB, //!< XGMI_3 accumulated data write size (KB)
RDC_FI_XGMI_4_WRITE_KB, //!< XGMI_4 accumulated data write size (KB)
RDC_FI_XGMI_5_WRITE_KB, //!< XGMI_5 accumulated data write size (KB)
RDC_FI_XGMI_6_WRITE_KB, //!< XGMI_6 accumulated data write size (KB)
RDC_FI_XGMI_7_WRITE_KB, //!< XGMI_7 accumulated data write size (KB)
RDC_FI_XGMI_TOTAL_READ_KB, //!< XGMI_SUM accumulated data read size (KB)
RDC_FI_XGMI_TOTAL_WRITE_KB, //!< XGMI_SUM accumulated data write size (KB)
RDC_FI_XGMI_0_WRITE_KB, //!< XGMI_0 accumulated data write size (KB)
RDC_FI_XGMI_1_WRITE_KB, //!< XGMI_1 accumulated data write size (KB)
RDC_FI_XGMI_2_WRITE_KB, //!< XGMI_2 accumulated data write size (KB)
RDC_FI_XGMI_3_WRITE_KB, //!< XGMI_3 accumulated data write size (KB)
RDC_FI_XGMI_4_WRITE_KB, //!< XGMI_4 accumulated data write size (KB)
RDC_FI_XGMI_5_WRITE_KB, //!< XGMI_5 accumulated data write size (KB)
RDC_FI_XGMI_6_WRITE_KB, //!< XGMI_6 accumulated data write size (KB)
RDC_FI_XGMI_7_WRITE_KB, //!< XGMI_7 accumulated data write size (KB)
RDC_FI_XGMI_TOTAL_READ_KB, //!< XGMI_SUM accumulated data read size (KB)
RDC_FI_XGMI_TOTAL_WRITE_KB, //!< XGMI_SUM accumulated data write size (KB)
/**
* @brief ROC-profiler related fields
@@ -340,14 +345,14 @@ typedef enum {
/**
* @brief RDC health related fields
*/
RDC_HEALTH_XGMI_ERROR = 3000, //!< XGMI one or more errors detected
RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count
RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number
RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number
RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< The threshold of retired page
RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,//!< The threshold of uncorrectable page
RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter
RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds)
RDC_HEALTH_XGMI_ERROR = 3000, //!< XGMI one or more errors detected
RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count
RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number
RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number
RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< The threshold of retired page
RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, //!< The threshold of uncorrectable page
RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter
RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds)
} rdc_field_t;
// even and odd numbers are used for correctable and uncorrectable errors
@@ -517,7 +522,7 @@ typedef enum {
*/
typedef enum {
RDC_AMDMSI_COMPONENT
//If needed later, add them one by one
// If needed later, add them one by one
} rdc_component_t;
/**
@@ -571,8 +576,8 @@ typedef struct {
typedef void (*rdc_callback_t)(void*, void*);
typedef struct {
rdc_callback_t callback; //!< Callback sends logs for running diagnostics
void* cookie; //!< Cookie is used to identify different callbacks and supply them with data
rdc_callback_t callback; //!< Callback sends logs for running diagnostics
void* cookie; //!< Cookie is used to identify different callbacks and supply them with data
} rdc_diag_callback_t;
/**
@@ -599,7 +604,7 @@ typedef enum { RDC_POLICY_ACTION_NONE, RDC_POLICY_ACTION_GPU_RESET } rdc_policy_
*/
typedef struct {
rdc_policy_condition_t condition; //!< condition to meet
rdc_policy_action_t action; //!< Action to take
rdc_policy_action_t action; //!< Action to take
} rdc_policy_t;
typedef enum {
@@ -663,12 +668,12 @@ typedef struct {
* @brief type of health watches
*/
typedef enum {
RDC_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches
RDC_HEALTH_WATCH_XGMI = 0x2, //!< XGMI system watches
RDC_HEALTH_WATCH_MEM = 0x4, //!< Memory watches
RDC_HEALTH_WATCH_INFOROM = 0x8, //!< Inforom watches
RDC_HEALTH_WATCH_THERMAL = 0x10, //!< Temperature watches
RDC_HEALTH_WATCH_POWER = 0x20, //!< Power watches
RDC_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches
RDC_HEALTH_WATCH_XGMI = 0x2, //!< XGMI system watches
RDC_HEALTH_WATCH_MEM = 0x4, //!< Memory watches
RDC_HEALTH_WATCH_INFOROM = 0x8, //!< Inforom watches
RDC_HEALTH_WATCH_THERMAL = 0x10, //!< Temperature watches
RDC_HEALTH_WATCH_POWER = 0x20, //!< Power watches
} rdc_health_system_t;
/**
@@ -708,32 +713,56 @@ typedef enum {
* @brief details of the health errors
*/
typedef struct {
char msg[MAX_HEALTH_MSG_LENGTH]; //!< The test result details
uint32_t code; //!< The low level error code
char msg[MAX_HEALTH_MSG_LENGTH]; //!< The test result details
uint32_t code; //!< The low level error code
} rdc_health_detail_t;
/**
* @brief details of the per health incidents
*/
typedef struct {
uint32_t gpu_index; //!< which GPU in this group have the issue
rdc_health_system_t component; //!< which components have the issue
rdc_health_result_t health; //!< health diagnosis of this incident
rdc_health_detail_t error; //!< The details of the error, rdc_health_error_code_t
uint32_t gpu_index; //!< which GPU in this group have the issue
rdc_health_system_t component; //!< which components have the issue
rdc_health_result_t health; //!< health diagnosis of this incident
rdc_health_detail_t error; //!< The details of the error, rdc_health_error_code_t
} rdc_health_incidents_t;
#define HEALTH_MAX_ERROR_ITEMS 64
/**
* @brief The health responses for test cases
*/
typedef struct {
rdc_health_result_t overall_health; //!< The overall health of this entire host
unsigned int incidents_count; //!< The number of health incidents reported in this struct
rdc_health_incidents_t incidents[HEALTH_MAX_ERROR_ITEMS]; //!< Report of the errors detected
rdc_health_result_t overall_health; //!< The overall health of this entire host
unsigned int incidents_count; //!< The number of health incidents reported in this struct
rdc_health_incidents_t incidents[HEALTH_MAX_ERROR_ITEMS]; //!< Report of the errors detected
} rdc_health_response_t;
/**
* @brief property id's for the configuration set/get
*/
typedef enum {
RDC_CFG_GFX_CLOCK_LIMIT,
RDC_CFG_MEMORY_CLOCK_LIMIT,
RDC_CFG_POWER_LIMIT
} rdc_config_type_t;
/**
 * @brief One configuration entry: a property id (rdc_config_type_t) together
 * with the value that should be maintained for it
 */
typedef struct {
rdc_config_type_t type;  //!< Which property this setting applies to
uint64_t target_value;   //!< Requested value; unit depends on type (presumably watts for
                         //!< RDC_CFG_POWER_LIMIT - TODO confirm clock units)
} rdc_config_setting_t;
/**
 * @brief Fixed-capacity list of settings filled in by the configuration get
 */
typedef struct {
uint32_t total_settings;  //!< Number of valid entries in settings
rdc_config_setting_t settings[RDC_MAX_CONFIG_SETTINGS];  //!< The settings themselves
} rdc_config_setting_list_t;
/**
* @brief Initialize ROCm RDC.
*
@@ -972,7 +1001,8 @@ rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_i
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component, rdc_component_version_t* p_rdc_compv);
rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component,
rdc_component_version_t* p_rdc_compv);
/**
* @brief Create a group contains multiple GPUs
@@ -1382,7 +1412,7 @@ typedef struct {
unsigned int version;
rdc_policy_condition_t condition; //!< the condition that is meet
rdc_gpu_group_t group_id; //!< The group id trigger this callback
int64_t value; //!< The current value that meet the condition
int64_t value; //!< The current value that meet the condition
} rdc_policy_callback_response_t;
/**
@@ -1514,6 +1544,50 @@ rdc_status_t rdc_device_topology_get(rdc_handle_t p_rdc_handle, uint32_t gpu_ind
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_link_status_get(rdc_handle_t p_rdc_handle, rdc_link_status_t* results);
/**
 *  @brief Set one configuration
 *
 *  @details Apply the given configuration setting to all nodes that belong to
 *  the given group
 *
 *  @param[in] p_rdc_handle The RDC handler.
 *
 *  @param[in] group_id The id of the group whose nodes are configured
 *
 *  @param[in] setting The configuration (property id and target value) to set
 *
 *  @retval ::RDC_ST_OK is returned upon successful call.
 *  @retval ::RDC_ST_INVALID_HANDLER is returned when p_rdc_handle is null.
 */
rdc_status_t rdc_config_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_config_setting_t setting);
/**
 *  @brief Get the configurations
 *
 *  @details Get all the configurations for the nodes that belong to the given
 *  group
 *
 *  @param[in] p_rdc_handle The RDC handler.
 *
 *  @param[in] group_id The id of the group to query
 *
 *  @param[out] settings Caller-provided list that receives the configurations;
 *  it holds at most RDC_MAX_CONFIG_SETTINGS entries.
 *
 *  @retval ::RDC_ST_OK is returned upon successful call.
 *  @retval ::RDC_ST_INVALID_HANDLER is returned when p_rdc_handle or settings
 *  is null.
 */
rdc_status_t rdc_config_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_config_setting_list_t* settings);
/**
 *  @brief Clear the configuration
 *
 *  @details Clear all the configurations for the nodes that belong to the
 *  given group
 *
 *  @param[in] p_rdc_handle The RDC handler.
 *
 *  @param[in] group_id The id of the group whose configurations are cleared
 *
 *  @retval ::RDC_ST_OK is returned upon successful call.
 *  @retval ::RDC_ST_INVALID_HANDLER is returned when p_rdc_handle is null.
 */
rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
#ifdef __cplusplus
}
+50
Просмотреть файл
@@ -0,0 +1,50 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_RDCCONFIGSETTINGS_H_
#define INCLUDE_RDC_LIB_RDCCONFIGSETTINGS_H_
#include <memory.h>
#include <memory>
#include "rdc/rdc.h"
namespace amd {
namespace rdc {
// Abstract interface of the configuration module: stores, reports and clears the
// per-group configuration settings (rdc_config_setting_t).
class RdcConfigSettings {
 public:
  // Applies one setting to the group identified by group_id.
  virtual rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) = 0;

  // Writes the settings of the group identified by group_id into *settings.
  virtual rdc_status_t rdc_config_get(rdc_gpu_group_t group_id,
                                      rdc_config_setting_list_t* settings) = 0;

  // Removes the settings of the group identified by group_id.
  virtual rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) = 0;

  // Virtual so implementations can be destroyed through a base pointer.
  virtual ~RdcConfigSettings() = default;
};

// Shared-ownership handle to a configuration module implementation.
using RdcConfigSettingsPtr = std::shared_ptr<RdcConfigSettings>;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCCONFIGSETTINGS_H_
+18 -4
Просмотреть файл
@@ -81,18 +81,21 @@ class RdcHandler {
// Diagnostic API
virtual rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response, rdc_diag_callback_t* callback) = 0;
rdc_diag_response_t* response,
rdc_diag_callback_t* callback) = 0;
virtual rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) = 0;
rdc_diag_test_result_t* result,
rdc_diag_callback_t* callback) = 0;
// Control API
virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0;
// It is just a client interface under the GRPC framework and is not used as an RDC API.
// The reason is that RdcEmbeddedHandler::get_mixed_component_version does not need to be called.
virtual rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) = 0;
virtual rdc_status_t get_mixed_component_version(mixed_component_t component,
mixed_component_version_t* p_mixed_compv) = 0;
// Policy API
virtual rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) = 0;
@@ -110,13 +113,24 @@ class RdcHandler {
// Health API
virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) = 0;
virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) = 0;
virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) = 0;
virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id,
rdc_health_response_t* response) = 0;
virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
// topology API
virtual rdc_status_t rdc_device_topology_get(uint32_t gpu_index,
rdc_device_topology_t* results) = 0;
virtual rdc_status_t rdc_link_status_get(rdc_link_status_t* results) = 0;
// Config API
// Apply one configuration setting to the nodes of the given group.
virtual rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) = 0;
// Get the configuration settings of the given group; the result is written to *settings.
virtual rdc_status_t rdc_config_get(rdc_gpu_group_t group_id,
rdc_config_setting_list_t* settings) = 0;
// Clear the configuration settings of the given group.
virtual rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) = 0;
virtual ~RdcHandler() {}
};
+73
Просмотреть файл
@@ -0,0 +1,73 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_IMPL_RDCCONFIGSETTINGSIMPL_H_
#define INCLUDE_RDC_LIB_IMPL_RDCCONFIGSETTINGSIMPL_H_
#include <atomic>
#include <condition_variable>
#include <mutex> // NOLINT
#include <thread>
#include <unordered_map>
#include "rdc_lib/RdcConfigSettings.h"
#include "rdc_lib/RdcGroupSettings.h"
namespace amd {
namespace rdc {
// Implementation of RdcConfigSettings that caches the requested settings per group and
// owns a monitor thread (see monitorSettings() below).
//
// NOTE(review): the class owns a std::thread but declares no destructor. Unless the thread
// is detached or joined elsewhere, destroying a joinable std::thread calls std::terminate -
// confirm against the .cc file.
class RdcConfigSettingsImpl : public RdcConfigSettings {
public:
// Set one configuration setting for the given group
rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) override;
// Get the settings of the given group; the result is written to *settings
rdc_status_t rdc_config_get(rdc_gpu_group_t group_id,
rdc_config_setting_list_t* settings) override;
// Clear the settings of the given group
rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) override;
// group_settings: group module, stored in group_settings_ (presumably used by
// get_group_info() to resolve a group id - confirm in the .cc file)
explicit RdcConfigSettingsImpl(const RdcGroupSettingsPtr& group_settings);
private:
RdcGroupSettingsPtr group_settings_;
// Requested settings, keyed first by group id and then by setting type
std::unordered_map<rdc_gpu_group_t, std::unordered_map<rdc_config_type_t, rdc_config_setting_t>>
cached_group_settings_;
std::thread monitor_thread_;  // Thread running monitorSettings()
std::mutex mutex_;  // Mutex for cached_group_settings_
std::atomic<bool> is_running_;  // Whether the monitor thread should keep running
std::condition_variable cv_;  // Presumably wakes the monitor thread early - confirm in the .cc
// monitorSettings() is kicked off from the RdcConfigSettingsImpl constructor as its own thread.
// Every minute, it checks whether the GPU settings reported by amdsmi are the same as those in
// cached_group_settings_. If not, it sets the mismatched values back to the values in
// cached_group_settings_.
void monitorSettings();
// Unit conversion helpers for the power limit.
// NOTE(review): microwattsToWatts takes an int while wattsToMicrowatts takes uint64_t; an int
// cannot hold microwatt values above roughly 2147 W - confirm this is intended.
uint64_t wattsToMicrowatts(uint64_t watts) const;
uint64_t microwattsToWatts(int microwatts) const;
// Look up the group identified by group_id and write its description to *rdc_group_info
rdc_status_t get_group_info(rdc_gpu_group_t group_id, rdc_group_info_t* rdc_group_info);
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCCONFIGSETTINGSIMPL_H_
+21 -5
Просмотреть файл
@@ -25,6 +25,7 @@ THE SOFTWARE.
#include <future> // NOLINT(build/c++11)
#include "rdc_lib/RdcCacheManager.h"
#include "rdc_lib/RdcConfigSettings.h"
#include "rdc_lib/RdcGroupSettings.h"
#include "rdc_lib/RdcHandler.h"
#include "rdc_lib/RdcMetricFetcher.h"
@@ -86,17 +87,20 @@ class RdcEmbeddedHandler final : public RdcHandler {
// Diagnostic API
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response, rdc_diag_callback_t* callback) override;
rdc_diag_response_t* response,
rdc_diag_callback_t* callback) override;
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override;
rdc_diag_test_result_t* result,
rdc_diag_callback_t* callback) override;
// Control API
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
// It is just a client interface under the GRPC framework and is not used as an RDC API.
// Pure virtual functions need to be overridden.
rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override;
rdc_status_t get_mixed_component_version(mixed_component_t component,
mixed_component_version_t* p_mixed_compv) override;
// Policy API
rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override;
@@ -106,19 +110,30 @@ class RdcEmbeddedHandler final : public RdcHandler {
rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
rdc_policy_condition_type_t condition_type) override;
rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id, rdc_policy_register_callback callback) override;
rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,
rdc_policy_register_callback callback) override;
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
// Health API
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override;
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override;
rdc_status_t rdc_link_status_get(rdc_link_status_t* results) override;
// Config API (RdcHandler overrides)
// Apply one configuration setting to the given group.
rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) override;
// Get the configuration settings of the given group; the result is written to *settings.
rdc_status_t rdc_config_get(rdc_gpu_group_t group_id,
rdc_config_setting_list_t* settings) override;
// Clear the configuration settings of the given group.
rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) override;
explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
~RdcEmbeddedHandler() final;
@@ -132,6 +147,7 @@ class RdcEmbeddedHandler final : public RdcHandler {
RdcWatchTablePtr watch_table_;
RdcMetricsUpdaterPtr metrics_updater_;
RdcPolicyPtr policy_;
RdcConfigSettingsPtr config_handler_;
std::future<void> updater_;
RdcTopologyLinkPtr topologylink_;
};
+21 -8
Просмотреть файл
@@ -23,8 +23,8 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_IMPL_RDCSTANDALONEHANDLER_H_
#include <grpcpp/grpcpp.h>
#include <memory>
#include <future>
#include <memory>
#include <thread>
#include "rdc.grpc.pb.h" // NOLINT
@@ -82,21 +82,35 @@ class RdcStandaloneHandler : public RdcHandler {
// Diagnostic API
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response, rdc_diag_callback_t* callback) override;
rdc_diag_response_t* response,
rdc_diag_callback_t* callback) override;
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override;
rdc_diag_test_result_t* result,
rdc_diag_callback_t* callback) override;
// Control RdcAPI
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
// Config API (RdcHandler overrides)
// Apply one configuration setting to the given group.
rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) override;
// Get the configuration settings of the given group; the result is written to *settings.
rdc_status_t rdc_config_get(rdc_gpu_group_t group_id,
rdc_config_setting_list_t* settings) override;
// Clear the configuration settings of the given group.
rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) override;
// It is just a client interface under the GRPC framework and is not used as an RDC API.
// Pure virtual functions need to be overridden
rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override;
rdc_status_t get_mixed_component_version(mixed_component_t component,
mixed_component_version_t* p_mixed_compv) override;
// Policy API
rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override;
rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override;
rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override;
rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
rdc_policy_condition_type_t condition_type) override;
@@ -109,7 +123,7 @@ class RdcStandaloneHandler : public RdcHandler {
// Health API
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override;
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override;
@@ -129,11 +143,10 @@ class RdcStandaloneHandler : public RdcHandler {
struct policy_thread_context {
bool start;
std::thread *t;
std::thread* t;
};
std::map<uint32_t, struct policy_thread_context> policy_threads_;
};
} // namespace rdc
+49
Просмотреть файл
@@ -207,6 +207,16 @@ service RdcAPI {
// rdc_gpu_group_t group_id,
// rdc_policy_condition_t condition);
rpc GetTopology(GetTopologyRequest) returns (GetTopologyResponse) {}
// Set one configuration setting for a group
rpc SetConfig(SetConfigRequest) returns (SetConfigResponse) {}
// Get the configuration settings of a group
rpc GetConfig(GetConfigRequest) returns (GetConfigResponse) {}
// Clear the configuration settings of a group
rpc ClearConfig(ClearConfigRequest) returns (ClearConfigResponse) {}
}
message Empty {
@@ -720,3 +730,42 @@ message GetTopologyResponse {
Topology toppology = 2;
}
// Property ids for the configuration API; same names and order as
// rdc_config_type_t in rdc.h.
enum rdc_config_type {
RDC_CFG_GFX_CLOCK_LIMIT = 0;
RDC_CFG_MEMORY_CLOCK_LIMIT = 1;
RDC_CFG_POWER_LIMIT = 2;
}
// One configuration entry; same fields as rdc_config_setting_t in rdc.h.
message rdc_config_setting {
rdc_config_type type = 1;  // Which property the setting applies to
uint64 target_value = 2;   // Requested value for that property
}
// Request of SetConfig: the group to configure and the setting to apply.
message SetConfigRequest {
uint32 group_id = 1;
rdc_config_setting setting = 2;
}
// Response of SetConfig.
message SetConfigResponse {
uint32 status = 1;  // Presumably an rdc_status_t value - TODO confirm against the server code
}
// Request of GetConfig: the group whose settings are queried.
message GetConfigRequest {
uint32 group_id = 1;
uint32 num_of_settings = 2;  // NOTE(review): meaning not visible here - confirm how the server uses it
}
// Response of GetConfig: the call status and the settings found for the group.
message GetConfigResponse {
uint32 status = 1;  // Presumably an rdc_status_t value - TODO confirm against the server code
repeated rdc_config_setting settings = 2;
}
// Request of ClearConfig: the group whose settings are cleared.
message ClearConfigRequest {
uint32 group_id = 1;
}
// Response of ClearConfig.
message ClearConfigResponse {
uint32 status = 1;  // Presumably an rdc_status_t value - TODO confirm against the server code
}
+44 -22
Просмотреть файл
@@ -183,7 +183,8 @@ rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_i
->rdc_device_get_attributes(gpu_index, p_rdc_attr);
}
rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component, rdc_component_version_t* p_rdc_compv) {
rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component,
rdc_component_version_t* p_rdc_compv) {
if (!p_rdc_handle || !p_rdc_compv) {
return RDC_ST_INVALID_HANDLER;
}
@@ -316,7 +317,8 @@ rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case, const char* config,
size_t config_size, rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
size_t config_size, rdc_diag_test_result_t* result,
rdc_diag_callback_t* callback) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
@@ -325,7 +327,8 @@ rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_
->rdc_test_case_run(group_id, test_case, config, config_size, result, callback);
}
rdc_status_t get_mixed_component_version(rdc_handle_t p_rdc_handle, mixed_component_t component, mixed_component_version_t* p_mixed_compv) {
rdc_status_t get_mixed_component_version(rdc_handle_t p_rdc_handle, mixed_component_t component,
mixed_component_version_t* p_mixed_compv) {
if (!p_rdc_handle || !p_mixed_compv) {
return RDC_ST_INVALID_HANDLER;
}
@@ -388,6 +391,32 @@ const char* rdc_diagnostic_result_string(rdc_diag_result_t result) {
}
}
// C API entry point: forwards one configuration setting for `group_id` to the
// RdcHandler behind `p_rdc_handle`.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise whatever the
// handler returns.
rdc_status_t rdc_config_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                            rdc_config_setting_t setting) {
  if (p_rdc_handle == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_config_set(group_id, setting);
}
// C API entry point: asks the RdcHandler behind `p_rdc_handle` to fill
// `settings` with the configuration of `group_id`.
// Returns RDC_ST_INVALID_HANDLER when either the handle or `settings` is null,
// otherwise whatever the handler returns.
rdc_status_t rdc_config_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                            rdc_config_setting_list_t* settings) {
  const bool has_handle = (p_rdc_handle != nullptr);
  const bool has_output = (settings != nullptr);
  if (!(has_handle && has_output)) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_config_get(group_id, settings);
}
// C API entry point: asks the RdcHandler behind `p_rdc_handle` to clear the
// configuration of `group_id`.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise whatever the
// handler returns.
rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) {
  if (p_rdc_handle == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_config_clear(group_id);
}
const char* field_id_string(rdc_field_t field_id) {
amd::rdc::fld_id2name_map_t& field_id_to_descript = amd::rdc::get_field_id_description_from_id();
return field_id_to_descript.find(field_id)->second.label.c_str();
@@ -407,8 +436,7 @@ rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_health_set(group_id, components);
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_health_set(group_id, components);
}
rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
@@ -417,18 +445,16 @@ rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_health_get(group_id, components);
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_health_get(group_id, components);
}
rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_health_response_t *response) {
rdc_health_response_t* response) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_health_check(group_id, response);
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_health_check(group_id, response);
}
rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) {
@@ -436,8 +462,7 @@ rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_i
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_health_clear(group_id);
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_health_clear(group_id);
}
char* strncpy_with_null(char* dest, const char* src, size_t n) {
@@ -464,35 +489,32 @@ rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_policy_get(group_id, count, policies);
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_policy_get(group_id, count, policies);
}
rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_condition_type_t condition_type){
rdc_policy_condition_type_t condition_type) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_policy_delete(group_id, condition_type);
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_policy_delete(group_id, condition_type);
}
rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_register_callback callback) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_policy_register(group_id, callback);
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_policy_register(group_id, callback);
}
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_policy_unregister(group_id);
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_policy_unregister(group_id);
}
rdc_status_t rdc_device_topology_get(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
rdc_device_topology_t* results) {
+1
Просмотреть файл
@@ -25,6 +25,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
"${SRC_DIR}/RdcRVSLib.cc"
"${SRC_DIR}/RdcSmiDiagnosticImpl.cc"
"${SRC_DIR}/RdcSmiLib.cc"
"${SRC_DIR}/RdcConfigSettingsImpl.cc"
"${SRC_DIR}/RdcTelemetryModule.cc"
"${SRC_DIR}/RdcWatchTableImpl.cc"
"${SRC_DIR}/SmiUtils.cc")
+364
Просмотреть файл
@@ -0,0 +1,364 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcConfigSettingsImpl.h"
#include <chrono>
#include <ctime>
#include <vector>
#include "amd_smi/amdsmi.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/SmiUtils.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Stores the group-settings provider used to resolve a group id into its GPU
// list. The monitoring thread is not started here; is_running_ begins false.
RdcConfigSettingsImpl::RdcConfigSettingsImpl(const RdcGroupSettingsPtr& group_settings)
: group_settings_(group_settings), is_running_(false) {}
// Body of the monitoring thread launched by rdc_config_set().
//
// Each iteration waits on cv_ for up to one minute (or until the predicate
// becomes true), then walks every cached group and every GPU in that group and
// re-applies cached settings through amd-smi. The loop ends when is_running_
// is false or the cache is empty.
//
// mutex_ is held for the whole sweep, including the amd-smi calls, so
// rdc_config_set/get/clear block while a sweep is in progress.
//
// Every `continue` in the per-GPU loop abandons the remaining checks for that
// GPU: a failed power-cap query therefore also skips the clock checks, even
// when no power limit is cached.
void RdcConfigSettingsImpl::monitorSettings() {
rdc_gpu_group_t group_id;
amdsmi_processor_handle processor_handle;
amdsmi_status_t status;
rdc_status_t rdc_status;
rdc_group_info_t rdc_group_info = {};
amdsmi_power_cap_info_t cap_info = {};
amdsmi_dev_perf_level_t perf_info = {};
uint32_t od;
uint64_t cached_value;
while (true) {
{ // Scope block for mutex
std::unique_lock<std::mutex> lock(mutex_);
// Returns early when asked to stop or when the cache has been emptied;
// otherwise returns after the one-minute timeout.
cv_.wait_for(lock, std::chrono::minutes(1),
[this] { return !is_running_ || cached_group_settings_.empty(); });
if (!is_running_ || cached_group_settings_.empty()) {
break; // Stop if the thread is requested to stop or settings are empty
}
for (const auto& group_pair : cached_group_settings_) {
group_id = group_pair.first;
const auto& cached_settings = group_pair.second;
rdc_status = get_group_info(group_id, &rdc_group_info);
if (rdc_status != RDC_ST_OK) {
// Error log handled in get_group_info
continue;
}
for (unsigned int i = 0; i < rdc_group_info.count; ++i) {
status = get_processor_handle_from_id(rdc_group_info.entity_ids[i], &processor_handle);
if (status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(): get_processor_handle_from_id faied: "
<< status);
continue;
}
// Power cap
// The current cap is queried unconditionally, before checking whether a
// power limit is cached for this group.
status = amdsmi_get_power_cap_info(processor_handle, 0, &cap_info);
if (status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_power_cap_info failed: "
<< status);
continue;
}
auto power_cap_it = cached_settings.find(RDC_CFG_POWER_LIMIT);
if (power_cap_it != cached_settings.end()) {
cached_value = power_cap_it->second.target_value;
// The cached target is in watts; the value read back is converted from
// microwatts before comparing, and the target is converted back when set.
if (microwattsToWatts(cap_info.power_cap) != cached_value) {
RDC_LOG(
RDC_INFO,
"RdcConfigSettingsImpl::monitorSettings(); Mismatched Power values, resetting");
status = amdsmi_set_power_cap(processor_handle, 0, wattsToMicrowatts(cached_value));
if (status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(
RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_set_power_cap_info failed: "
<< status);
continue;
}
}
}
// Mem clock
status = amdsmi_get_gpu_overdrive_level(processor_handle, &od);
if (status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(
RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_overdrive_level failed: "
<< status);
continue;
}
auto mem_clk_it = cached_settings.find(RDC_CFG_MEMORY_CLOCK_LIMIT);
if (mem_clk_it != cached_settings.end()) {
cached_value = mem_clk_it->second.target_value;
// NOTE(review): this compares the overdrive level with the cached clock
// limit and re-applies the limit only when they are EQUAL. The power-cap
// branch above re-applies on a mismatch -- confirm the intended condition
// and the value that should be compared.
if (od == cached_value) {
status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM,
CLK_LIMIT_MAX, cached_value);
if (status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_set_gpu_clk_limit failed "
"for mem clk: "
<< status);
continue;
}
}
}
// GFX clock
// NOTE(review): perf_info is fetched here but never read afterwards; the
// comparison below still uses `od` from the overdrive query above.
status = amdsmi_get_gpu_perf_level(processor_handle, &perf_info);
if (status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_perf_level failed: "
<< status);
continue;
}
auto gfx_clk_it = cached_settings.find(RDC_CFG_GFX_CLOCK_LIMIT);
if (gfx_clk_it != cached_settings.end()) {
cached_value = gfx_clk_it->second.target_value;
if (od == cached_value) {
status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX,
CLK_LIMIT_MAX, cached_value);
if (status != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::monitorSettings(); amdsmi_set_gpu_clk_limit failed "
"for gfx clk: "
<< status);
continue;
}
}
}
}
}
}
}
RDC_LOG(RDC_INFO, "RdcConfigSettingsImpl Monitoring Thread Stopped");
}
// Converts a power value from watts to microwatts (multiplies by 1'000'000).
uint64_t RdcConfigSettingsImpl::wattsToMicrowatts(uint64_t watts) const {
  constexpr uint64_t kMicrowattsPerWatt = 1'000'000;
  return watts * kMicrowattsPerWatt;
}
// Converts a power value from microwatts to whole watts. The division is done
// on `int`, so any fractional watt is truncated before the result is widened.
uint64_t RdcConfigSettingsImpl::microwattsToWatts(int microwatts) const {
  constexpr int kMicrowattsPerWatt = 1'000'000;
  const int whole_watts = microwatts / kMicrowattsPerWatt;
  return whole_watts;
}
// Looks up the GPU list of `group_id` through the group-settings provider.
//
// On failure the error is logged here, so callers only need to check the
// returned status. The log names this helper rather than a specific caller,
// because it is shared by rdc_config_set(), rdc_config_clear() and
// monitorSettings().
//
// @param group_id        group to resolve
// @param rdc_group_info  out: filled in by rdc_group_gpu_get_info()
// @return the status reported by rdc_group_gpu_get_info()
rdc_status_t RdcConfigSettingsImpl::get_group_info(rdc_gpu_group_t group_id,
                                                   rdc_group_info_t* rdc_group_info) {
  rdc_status_t status = group_settings_->rdc_group_gpu_get_info(group_id, rdc_group_info);
  if (status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR,
            "RdcConfigSettingsImpl::get_group_info(): rdc_group_gpu_get_info failed : " << status);
  }
  return status;
}
// Applies one configuration setting to every GPU of a group and caches it.
//
// The setting is pushed to each GPU through amd-smi; the first failure stops
// the loop. Only when every call succeeded is the setting stored in
// cached_group_settings_, and the monitoring thread is started if it is not
// already running.
//
// @param group_id  group whose GPUs receive the setting
// @param setting   setting type and target value (the power limit is taken in
//                  watts and converted to microwatts for amd-smi)
// @return RDC_ST_OK on success; RDC_ST_UNKNOWN_ERROR when the group lookup or
//         any amd-smi call fails
rdc_status_t RdcConfigSettingsImpl::rdc_config_set(rdc_gpu_group_t group_id,
                                                   rdc_config_setting_t setting) {
  amdsmi_processor_handle processor_handle;
  // Start from success: for a group with no entities the loop body never runs,
  // and the status is still inspected after the loop.
  amdsmi_status_t amd_ret = AMDSMI_STATUS_SUCCESS;

  // Get the group info for gpu_index list
  rdc_group_info_t rdc_group_info;
  if (get_group_info(group_id, &rdc_group_info) != RDC_ST_OK) {
    return RDC_ST_UNKNOWN_ERROR;
  }

  for (unsigned int i = 0; i < rdc_group_info.count; ++i) {
    amd_ret = get_processor_handle_from_id(rdc_group_info.entity_ids[i], &processor_handle);
    if (amd_ret != AMDSMI_STATUS_SUCCESS) {
      RDC_LOG(
          RDC_ERROR,
          "RdcConfigSettingsImpl::rdc_config_set(): Failed to get processor handle : " << amd_ret);
      break;
    }

    if (setting.type == RDC_CFG_POWER_LIMIT) {
      amd_ret = amdsmi_set_power_cap(processor_handle, 0, wattsToMicrowatts(setting.target_value));
      if (amd_ret != AMDSMI_STATUS_SUCCESS) {
        RDC_LOG(RDC_ERROR,
                "RdcConfigSettingsImpl::rdc_config_set: amdsmi_set_power_cap failed : " << amd_ret);
        break;
      }
    } else if (setting.type == RDC_CFG_MEMORY_CLOCK_LIMIT) {
      amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX,
                                         setting.target_value);
      if (amd_ret != AMDSMI_STATUS_SUCCESS) {
        RDC_LOG(
            RDC_ERROR,
            "RdcConfigSettingsImpl::rdc_config_set: amdsmi_set_gpu_clk_limit failed : " << amd_ret);
        break;
      }
    } else if (setting.type == RDC_CFG_GFX_CLOCK_LIMIT) {
      amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX,
                                         setting.target_value);
      if (amd_ret != AMDSMI_STATUS_SUCCESS) {
        RDC_LOG(
            RDC_ERROR,
            "RdcConfigSettingsImpl::rdc_config_set: amdsmi_set_gpu_clk_limit failed : " << amd_ret);
        break;
      }
    }
  }

  if (amd_ret != AMDSMI_STATUS_SUCCESS) {
    return RDC_ST_UNKNOWN_ERROR;
  }

  // Record the setting so the monitor can re-apply it, and make sure the
  // monitor is running. The thread is created while mutex_ is held.
  std::lock_guard<std::mutex> lock(mutex_);
  cached_group_settings_[group_id][setting.type] = setting;
  if (!is_running_) {
    is_running_ = true;
    monitor_thread_ = std::thread(&RdcConfigSettingsImpl::monitorSettings, this);
    RDC_LOG(RDC_INFO, "RdcConfigSettingsImpl Monitoring Thread Started");
  }
  return RDC_ST_OK;
}
// Copies the cached settings of `group_id` into `settings`.
//
// Only the cache is consulted. Returns RDC_ST_NOT_FOUND when nothing is cached
// for the group and RDC_ST_MAX_LIMIT when the cache holds more entries than
// RDC_MAX_CONFIG_SETTINGS (entries copied before the limit was hit stay in
// `settings`, but total_settings is not updated in that case).
rdc_status_t RdcConfigSettingsImpl::rdc_config_get(rdc_gpu_group_t group_id,
                                                   rdc_config_setting_list_t* settings) {
  std::lock_guard<std::mutex> guard(mutex_);

  // The group must already have an entry in the cache.
  const auto cached = cached_group_settings_.find(group_id);
  if (cached == cached_group_settings_.end()) {
    RDC_LOG(RDC_ERROR, "rdc_config_get: group_id not found in cache: " << RDC_ST_NOT_FOUND);
    return RDC_ST_NOT_FOUND;
  }

  // Copy entries one by one into the caller's fixed-size array.
  int filled = 0;
  for (const auto& entry : cached->second) {
    if (filled >= RDC_MAX_CONFIG_SETTINGS) {
      RDC_LOG(RDC_ERROR,
              "RdcConfigSettingsImpl::rdc_config_get: more settings than RDC_MAX_CONFIG_SETTINGS: "
                  << RDC_ST_MAX_LIMIT);
      return RDC_ST_MAX_LIMIT;
    }
    auto& slot = settings->settings[filled];
    slot.type = entry.first;
    slot.target_value = entry.second.target_value;
    ++filled;
  }

  settings->total_settings = filled;
  return RDC_ST_OK;
}
// Clears the cached settings of `group_id` and tries to put the affected GPUs
// back to their defaults.
//
// Only settings present in the cache for this group are reset. The first
// failing set/handle call stops the per-GPU loop, but the cache entry is still
// erased afterwards and the failure is reported through the return value.
// When the cache becomes empty the monitoring thread is asked to stop and is
// joined; mutex_ is released before the join.
//
// Returns RDC_ST_OK when nothing was cached or every call succeeded, and
// RDC_ST_UNKNOWN_ERROR otherwise. If the group lookup fails, the function
// returns early and the cache entry is left in place.
rdc_status_t RdcConfigSettingsImpl::rdc_config_clear(rdc_gpu_group_t group_id) {
amdsmi_status_t amd_ret = AMDSMI_STATUS_SUCCESS;
amdsmi_processor_handle processor_handle;
// Check if group_id has any cached settings
std::unique_lock<std::mutex> lock(mutex_);
auto group_iter = cached_group_settings_.find(group_id);
if (group_iter == cached_group_settings_.end()) {
// No cached settings for this group, nothing to clear
return RDC_ST_OK;
}
rdc_group_info_t rdc_group_info;
if (get_group_info(group_id, &rdc_group_info) != RDC_ST_OK) {
return RDC_ST_UNKNOWN_ERROR;
}
// Iterate over each GPU in the group and clear only the cached settings
for (unsigned int i = 0; i < rdc_group_info.count; ++i) {
amd_ret = get_processor_handle_from_id(rdc_group_info.entity_ids[i], &processor_handle);
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::rdc_config_clear(): Failed to get processor handle : "
<< amd_ret);
break;
}
// Reset power cap if it was set
// NOTE(review): a failed amdsmi_get_* query in the three sections below does
// not break out of the loop; its status can be overwritten by a later call.
if (group_iter->second.find(RDC_CFG_POWER_LIMIT) != group_iter->second.end()) {
amdsmi_power_cap_info_t cap_info = {};
amd_ret = amdsmi_get_power_cap_info(processor_handle, 0, &cap_info);
if (amd_ret == AMDSMI_STATUS_SUCCESS && cap_info.power_cap != cap_info.default_power_cap) {
amd_ret = amdsmi_set_power_cap(processor_handle, 0, cap_info.default_power_cap);
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset power cap : "
<< amd_ret);
break;
}
}
}
// Reset GFX clock limit if it was set
if (group_iter->second.find(RDC_CFG_GFX_CLOCK_LIMIT) != group_iter->second.end()) {
amdsmi_dev_perf_level_t perf_info = {};
amd_ret = amdsmi_get_gpu_perf_level(processor_handle, &perf_info);
if (amd_ret == AMDSMI_STATUS_SUCCESS && perf_info != AMDSMI_DEV_PERF_LEVEL_AUTO) {
// NOTE(review): a perf-level enumerator is passed as the clock-limit value
// here -- confirm whether amdsmi_set_gpu_perf_level() was intended.
amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX,
AMDSMI_DEV_PERF_LEVEL_AUTO);
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : "
<< amd_ret);
break;
}
}
}
// Reset memory clock limit if it was set
if (group_iter->second.find(RDC_CFG_MEMORY_CLOCK_LIMIT) != group_iter->second.end()) {
uint32_t od = 0;
amd_ret = amdsmi_get_gpu_overdrive_level(processor_handle, &od);
if (amd_ret == AMDSMI_STATUS_SUCCESS && od != 0) {
// NOTE(review): the memory clock limit is set to 0 as the "reset" value --
// confirm this is how amd-smi restores the default.
amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, 0);
if (amd_ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:"
<< amd_ret);
break;
}
}
}
}
// The cache entry is dropped even if one of the resets above failed.
cached_group_settings_.erase(group_id);
if (cached_group_settings_.empty()) {
// Nothing left to monitor: flag the thread to stop, wake it, and release the
// mutex before joining so the thread can re-acquire it and exit.
is_running_ = false;
cv_.notify_all();
lock.unlock();
if (monitor_thread_.joinable()) {
monitor_thread_.join(); // Wait for the thread to finish
}
RDC_LOG(RDC_INFO, "RdcConfigSettingsImpl Monitoring Thread Stopped");
}
return (amd_ret == AMDSMI_STATUS_SUCCESS) ? RDC_ST_OK : RDC_ST_UNKNOWN_ERROR;
}
} // namespace rdc
} // namespace amd
+36 -13
Просмотреть файл
@@ -30,6 +30,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcNotification.h"
#include "rdc_lib/impl/RdcCacheManagerImpl.h"
#include "rdc_lib/impl/RdcConfigSettingsImpl.h"
#include "rdc_lib/impl/RdcGroupSettingsImpl.h"
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h"
@@ -80,10 +81,12 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode)
metric_fetcher_(new RdcMetricFetcherImpl()),
rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)),
rdc_notif_(new RdcNotificationImpl()),
watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, metric_fetcher_, rdc_module_mgr_, rdc_notif_)),
watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, metric_fetcher_,
rdc_module_mgr_, rdc_notif_)),
metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)),
policy_(new RdcPolicyImpl(group_settings_,metric_fetcher_)),
topologylink_(new RdcTopologyLinkImpl(group_settings_, metric_fetcher_)) {
policy_(new RdcPolicyImpl(group_settings_, metric_fetcher_)),
topologylink_(new RdcTopologyLinkImpl(group_settings_, metric_fetcher_)),
config_handler_(new RdcConfigSettingsImpl(group_settings_)) {
if (mode == RDC_OPERATION_MODE_AUTO) {
RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO");
metrics_updater_->start();
@@ -199,7 +202,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index,
return status;
}
rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(rdc_component_t component, rdc_component_version_t* p_rdc_compv) {
rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(
rdc_component_t component, rdc_component_version_t* p_rdc_compv) {
if (!p_rdc_compv) {
return RDC_ST_BAD_PARAMETER;
}
@@ -211,7 +215,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(rdc_component_
ret = amdsmi_get_lib_version(&ver);
if (ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Failed to obtain the version of the server's amd-smi library. reason: " << (ret == AMDSMI_STATUS_INVAL ? "Invalid parameters" : "unknown"));
RDC_LOG(RDC_ERROR, "Failed to obtain the version of the server's amd-smi library. reason: "
<< (ret == AMDSMI_STATUS_INVAL ? "Invalid parameters" : "unknown"));
return RDC_ST_MSI_ERROR;
}
@@ -383,7 +388,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
rdc_diag_level_t level, const char* config,
size_t config_size,
rdc_diag_response_t* response, rdc_diag_callback_t* callback) {
rdc_diag_response_t* response,
rdc_diag_callback_t* callback) {
if (!response) {
return RDC_ST_BAD_PARAMETER;
}
@@ -400,7 +406,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
rdc_diag_test_result_t* result,
rdc_diag_callback_t* callback) {
if (!result) {
return RDC_ST_BAD_PARAMETER;
}
@@ -428,7 +435,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_update_all(uint32_t wait_for_update)
// It is just a client interface under the GRPC framework and is not used as an RDC API.
// Just write an empty function to solve compilation errors
rdc_status_t RdcEmbeddedHandler::get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) {
rdc_status_t RdcEmbeddedHandler::get_mixed_component_version(
mixed_component_t component, mixed_component_version_t* p_mixed_compv) {
(void)(component);
(void)(p_mixed_compv);
return RDC_ST_OK;
@@ -463,8 +471,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_policy_unregister(rdc_gpu_group_t group_id)
}
// Health API
rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id,
unsigned int components) {
rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) {
if (0 == components) {
return RDC_ST_BAD_PARAMETER;
}
@@ -473,7 +480,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id,
}
rdc_status_t RdcEmbeddedHandler::rdc_health_get(rdc_gpu_group_t group_id,
unsigned int *components) {
unsigned int* components) {
if (components == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -482,7 +489,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_health_get(rdc_gpu_group_t group_id,
}
rdc_status_t RdcEmbeddedHandler::rdc_health_check(rdc_gpu_group_t group_id,
rdc_health_response_t *response) {
rdc_health_response_t* response) {
if (response == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -491,7 +498,6 @@ rdc_status_t RdcEmbeddedHandler::rdc_health_check(rdc_gpu_group_t group_id,
}
rdc_status_t RdcEmbeddedHandler::rdc_health_clear(rdc_gpu_group_t group_id) {
return watch_table_->rdc_health_clear(group_id);
}
@@ -504,5 +510,22 @@ rdc_status_t RdcEmbeddedHandler::rdc_link_status_get(rdc_link_status_t* results)
return topologylink_->rdc_link_status_get(results);
}
// Embedded mode: hands one configuration setting for `group_id` to the
// in-process config handler and returns its status unchanged.
rdc_status_t RdcEmbeddedHandler::rdc_config_set(rdc_gpu_group_t group_id,
                                                rdc_config_setting_t setting) {
  const rdc_status_t status = config_handler_->rdc_config_set(group_id, setting);
  return status;
}
// Embedded mode: fills `settings` with the configuration of `group_id` by
// delegating to the in-process config handler.
//
// @param group_id  group to query
// @param settings  out: receives the settings; must not be null
// @return RDC_ST_BAD_PARAMETER when `settings` is null (same convention as the
//         other out-pointer checks in this handler), otherwise the config
//         handler's status
rdc_status_t RdcEmbeddedHandler::rdc_config_get(rdc_gpu_group_t group_id,
                                                rdc_config_setting_list_t* settings) {
  if (settings == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  return config_handler_->rdc_config_get(group_id, settings);
}
// Embedded mode: asks the in-process config handler to clear the configuration
// of `group_id` and returns its status unchanged.
rdc_status_t RdcEmbeddedHandler::rdc_config_clear(rdc_gpu_group_t group_id) {
  const rdc_status_t status = config_handler_->rdc_config_clear(group_id);
  return status;
}
} // namespace rdc
} // namespace amd
+66 -3
Просмотреть файл
@@ -702,6 +702,69 @@ rdc_status_t RdcStandaloneHandler::rdc_field_update_all(uint32_t wait_for_update
return error_handle(status, reply.status());
}
// Standalone mode: sends one configuration setting for `group_id` to the
// server through the SetConfig RPC.
//
// @param group_id  group the setting is for
// @param setting   setting type and target value, copied into the request
// @return the status derived by error_handle() from the gRPC status and the
//         status field of the reply
rdc_status_t RdcStandaloneHandler::rdc_config_set(rdc_gpu_group_t group_id,
                                                  rdc_config_setting_t setting) {
  ::rdc::SetConfigRequest request;
  ::rdc::SetConfigResponse reply;
  ::grpc::ClientContext context;

  request.set_group_id(group_id);

  // mutable_setting() already yields a pointer to the embedded message, so no
  // cast is needed.
  ::rdc::rdc_config_setting* setting_ref = request.mutable_setting();
  setting_ref->set_type(static_cast<::rdc::rdc_config_type>(setting.type));
  setting_ref->set_target_value(setting.target_value);

  ::grpc::Status status = stub_->SetConfig(&context, request, &reply);
  return error_handle(status, reply.status());
}
// Standalone mode: fetches the configuration of `group_id` from the server
// through the GetConfig RPC and copies it into `settings`.
//
// @param group_id  group to query
// @param settings  out: receives the settings; must not be null
// @return RDC_ST_BAD_PARAMETER when `settings` is null;
//         the error_handle() result when the RPC or the server reports an error;
//         RDC_ST_MAX_LIMIT when the reply holds more than
//         RDC_MAX_CONFIG_SETTINGS entries (nothing is copied in that case);
//         RDC_ST_OK otherwise
rdc_status_t RdcStandaloneHandler::rdc_config_get(rdc_gpu_group_t group_id,
                                                  rdc_config_setting_list_t* settings) {
  if (!settings) {
    return RDC_ST_BAD_PARAMETER;
  }

  ::rdc::GetConfigRequest request;
  ::rdc::GetConfigResponse reply;
  ::grpc::ClientContext context;

  request.set_group_id(group_id);
  ::grpc::Status status = stub_->GetConfig(&context, request, &reply);
  rdc_status_t err_status = error_handle(status, reply.status());
  if (err_status != RDC_ST_OK) return err_status;

  // Reject oversized replies up front; past this point every entry fits in the
  // caller's fixed-size array.
  const int count = reply.settings_size();
  if (count > RDC_MAX_CONFIG_SETTINGS) return RDC_ST_MAX_LIMIT;

  for (int i = 0; i < count; ++i) {
    const ::rdc::rdc_config_setting& result = reply.settings(i);
    settings->settings[i].type = static_cast<rdc_config_type_t>(result.type());
    settings->settings[i].target_value = result.target_value();
  }
  settings->total_settings = count;

  // err_status is RDC_ST_OK here; the RPC status was already evaluated above.
  return err_status;
}
// Standalone mode: asks the server to clear the configuration of `group_id`
// through the ClearConfig RPC. Returns the status derived by error_handle()
// from the gRPC status and the status field of the reply.
rdc_status_t RdcStandaloneHandler::rdc_config_clear(rdc_gpu_group_t group_id) {
  ::rdc::ClearConfigRequest clear_request;
  clear_request.set_group_id(group_id);

  ::rdc::ClearConfigResponse clear_reply;
  ::grpc::ClientContext rpc_context;
  ::grpc::Status rpc_status = stub_->ClearConfig(&rpc_context, clear_request, &clear_reply);

  return error_handle(rpc_status, clear_reply.status());
}
// It is only an interface for the client under the GRPC framework and is not used as an RDC API.
rdc_status_t RdcStandaloneHandler::get_mixed_component_version(
mixed_component_t component, mixed_component_version_t* p_mixed_compv) {
@@ -804,7 +867,7 @@ rdc_status_t RdcStandaloneHandler::rdc_policy_register(rdc_gpu_group_t group_id,
}
// no registered callback, start the thread to read the stream from rdcd
struct policy_thread_context ctx = {true,nullptr};
struct policy_thread_context ctx = {true, nullptr};
ctx.t = new std::thread([this, group_id, callback]() {
// call rdcd
@@ -905,7 +968,7 @@ rdc_status_t RdcStandaloneHandler::rdc_health_get(rdc_gpu_group_t group_id,
}
rdc_status_t RdcStandaloneHandler::rdc_health_check(rdc_gpu_group_t group_id,
rdc_health_response_t *response) {
rdc_health_response_t* response) {
if (!response) {
return RDC_ST_BAD_PARAMETER;
}
@@ -931,7 +994,7 @@ rdc_status_t RdcStandaloneHandler::rdc_health_check(rdc_gpu_group_t group_id,
to_result.component = static_cast<rdc_health_system_t>(result.component());
to_result.health = static_cast<rdc_health_result_t>(result.health());
//set error
// set error
to_result.error.code = result.error().code();
strncpy_with_null(to_result.error.msg, result.error().msg().c_str(), MAX_HEALTH_MSG_LENGTH);
}
+1
Просмотреть файл
@@ -69,6 +69,7 @@ set(RDCI_SRC_LIST
"${SRC_DIR}/RdciStatsSubSystem.cc"
"${SRC_DIR}/RdciPolicySubSystem.cc"
"${SRC_DIR}/RdciHealthSubSystem.cc"
"${SRC_DIR}/RdciConfigSubSystem.cc"
"${SRC_DIR}/RdciSubSystem.cc"
"${SRC_DIR}/RdciTopologyLinkSubSystem.cc"
"${SRC_DIR}/rdci.cc")
+59
Просмотреть файл
@@ -0,0 +1,59 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RDCI_INCLUDE_RDCICONFIGSUBSYSTEM_H_
#define RDCI_INCLUDE_RDCICONFIGSUBSYSTEM_H_
#include "RdciSubSystem.h"
namespace amd {
namespace rdc {
// rdci "config" sub-command: parses its command-line options and runs the
// selected set / get / clear / help operation.
class RdciConfigSubSystem : public RdciSubSystem {
public:
RdciConfigSubSystem();
~RdciConfigSubSystem() override;
// Parses the sub-command options into the members below.
void parse_cmd_opts(int argc, char** argv) override;
// Executes the operation selected by parse_cmd_opts().
void process() override;
// Operation selected on the command line.
typedef enum {
CONFIG_COMMAND_NONE = 0,
CONFIG_COMMAND_SET,
CONFIG_COMMAND_GET,
CONFIG_COMMAND_CLEAR,
CONFIG_COMMAND_HELP,
} config_command_type_t;
private:
// Prints the usage text (nothing is printed in JSON output mode).
void show_help() const;
void display_config_settings(rdc_config_setting_list_t& rdc_configs_list);
config_command_type_t config_cmd_;
// Sentinel compared against fgid_ in the destructor to decide whether the
// field group still has to be unwatched and destroyed.
static constexpr rdc_field_grp_t JOB_FIELD_ID = 1;
// Target GPU group, from -g/--group.
uint32_t group_id_;
// Requested values from --powerlimit, --gfxmaxclk and --memmaxclk; process()
// treats 0 as "not requested".
uint32_t power_limit_;
uint64_t gfx_max_clock_;
uint64_t memory_max_clock_;
// Field group id filled in by rdc_group_field_create() in process().
rdc_field_grp_t fgid_;
};
} // namespace rdc
} // namespace amd
#endif // RDCI_INCLUDE_RDCICONFIGSUBSYSTEM_H_
+391
Просмотреть файл
@@ -0,0 +1,391 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "RdciConfigSubSystem.h"
#include <getopt.h>
#include <unistd.h>
#include <algorithm>
#include <iomanip>
#include <sstream>
#include <unordered_map>
#include <vector>
#include "common/rdc_utils.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/rdc_common.h"
static constexpr uint32_t TABLE_COLUMN_WIDTH = 20;
namespace amd {
namespace rdc {
// Builds the sub-system with no command selected and every option at zero.
// group_id_ is zero-initialized as well: the destructor passes it to
// rdc_field_unwatch() even when parse_cmd_opts() never saw -g/--group
// (for example on the --help path), so it must not be left indeterminate.
RdciConfigSubSystem::RdciConfigSubSystem()
    : config_cmd_(CONFIG_COMMAND_NONE),
      group_id_(0),
      power_limit_(0),
      gfx_max_clock_(0),
      memory_max_clock_(0),
      fgid_(0) {}
// Stops watching and destroys the field group unless fgid_ already holds the
// JOB_FIELD_ID sentinel, then stores the sentinel in fgid_.
RdciConfigSubSystem::~RdciConfigSubSystem() {
  if (fgid_ == JOB_FIELD_ID) {
    return;
  }

  rdc_field_unwatch(rdc_handle_, group_id_, fgid_);
  rdc_group_field_destroy(rdc_handle_, fgid_);
  fgid_ = JOB_FIELD_ID;
}
// Parses the options of "rdci config".
//
// Recognized options:
//   -s/--set, -t/--get, -c/--clear, -h/--help   select the operation
//   -g/--group <id>                             target GPU group
//   -p/--powerlimit <value>                     power limit
//   -x/--gfxmaxclk <value>                      GFX max clock
//   -m/--memmaxclk <value>                      memory max clock
//   -u/--unauth                                 disable authentication
//   --json                                      JSON output
//
// Throws RdcException(RDC_ST_BAD_PARAMETER) for an unknown option, when no
// operation was selected, or when set/get/clear is used without a group id.
void RdciConfigSubSystem::parse_cmd_opts(int argc, char** argv) {
  const int JSON_OPTIONS = 1001;
  const struct option long_options[] = {{"set", no_argument, nullptr, 's'},
                                        {"get", no_argument, nullptr, 't'},
                                        {"clear", no_argument, nullptr, 'c'},
                                        {"unauth", optional_argument, nullptr, 'u'},
                                        {"group", required_argument, nullptr, 'g'},
                                        {"powerlimit", required_argument, nullptr, 'p'},
                                        {"gfxmaxclk", required_argument, nullptr, 'x'},
                                        {"memmaxclk", required_argument, nullptr, 'm'},
                                        {"help", optional_argument, nullptr, 'h'},
                                        {"json", optional_argument, nullptr, JSON_OPTIONS},
                                        {nullptr, 0, nullptr, 0}};

  int option_index = 0;
  int opt = 0;
  config_cmd_ = CONFIG_COMMAND_NONE;
  bool group_id_set = false;  // ensure set, get, and clear have a group associated with them

  // Every short option that takes a value needs a trailing ':' -- including
  // 'm'. Without it, "-m <value>" leaves optarg null and the std::stoul() call
  // below would be handed a null pointer.
  while ((opt = getopt_long(argc, argv, "stcuhg:p:x:m:", long_options, &option_index)) != -1) {
    switch (opt) {
      case 's':
        config_cmd_ = CONFIG_COMMAND_SET;
        break;
      case 't':
        config_cmd_ = CONFIG_COMMAND_GET;
        break;
      case 'c':
        config_cmd_ = CONFIG_COMMAND_CLEAR;
        break;
      case 'u':
        use_auth_ = false;
        break;
      case 'h':
        config_cmd_ = CONFIG_COMMAND_HELP;
        break;
      case 'g':
        group_id_ = std::stoi(optarg);
        group_id_set = true;
        break;
      case 'p':
        power_limit_ = std::stoul(optarg);
        break;
      case 'x':
        gfx_max_clock_ = std::stoul(optarg);
        break;
      case 'm':
        memory_max_clock_ = std::stoul(optarg);
        break;
      case JSON_OPTIONS:
        set_json_output(true);
        break;
      default:
        show_help();
        throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options");
    }
  }

  if (config_cmd_ == CONFIG_COMMAND_NONE) {
    show_help();
    throw RdcException(RDC_ST_BAD_PARAMETER, "Must specify a valid operations");
  }

  // Enforce a mandatory group id for set, get, and clear
  if ((config_cmd_ == CONFIG_COMMAND_SET || config_cmd_ == CONFIG_COMMAND_GET ||
       config_cmd_ == CONFIG_COMMAND_CLEAR) &&
      !group_id_set) {
    show_help();
    throw RdcException(
        RDC_ST_BAD_PARAMETER,
        "Must specify a group ID (-g or --group) for set, get, and clear operations");
  }
}
// Prints the usage text of "rdci config" followed by the common usage block.
// Nothing is printed in JSON output mode.
//
// The group option is shown without brackets for set/get/clear because
// parse_cmd_opts() rejects those operations when no group id is given.
void RdciConfigSubSystem::show_help() const {
  if (is_json_output()) return;

  std::cout << " config -- Used to configure GPU to have configuration across workloads and across "
               "devices.\n\n";
  std::cout << "Usage\n";
  std::cout << "    rdci config --help\n";
  std::cout << "    rdci config -g <group id> --set [--powerlimit <value>] [--gfxmaxclk "
               "<value>] [--memmaxclk <value>]\n";
  std::cout << "    rdci config -g <group id> --get\n";
  std::cout << "    rdci config -g <group id> --clear\n";
  show_common_usage();
}
void RdciConfigSubSystem::process() {
rdc_status_t result = RDC_ST_UNKNOWN_ERROR;
std::ostringstream json_ss;
switch (config_cmd_) {
case CONFIG_COMMAND_SET: {
rdc_config_setting_t setting;
if (gfx_max_clock_ != 0) {
setting.type = RDC_CFG_GFX_CLOCK_LIMIT;
setting.target_value = gfx_max_clock_;
result = rdc_config_set(rdc_handle_, group_id_, setting);
}
if (power_limit_ != 0) {
setting.type = RDC_CFG_POWER_LIMIT;
setting.target_value = power_limit_;
result = rdc_config_set(rdc_handle_, group_id_, setting);
}
if (memory_max_clock_ != 0) {
setting.type = RDC_CFG_MEMORY_CLOCK_LIMIT;
setting.target_value = memory_max_clock_;
result = rdc_config_set(rdc_handle_, group_id_, setting);
}
if (result == RDC_ST_OK) {
if (is_json_output()) {
json_ss << "{"
<< "\"group_id\": \"" << group_id_ << "\", \"status\": \"ok\""
<< "}";
} else {
std::cout << "Successfully configured GPU Id belongs to group: " << group_id_
<< std::endl;
}
std::cout << json_ss.str() << std::endl;
return;
}
break;
}
case CONFIG_COMMAND_GET: {
// Add the default job stats fields
std::vector<rdc_field_t> job_fields = {RDC_FI_GPU_CLOCK, RDC_FI_MEM_CLOCK,
RDC_FI_POWER_USAGE};
static const char job_field_group[] = "RdciConfigSubSystem";
result = rdc_group_field_create(rdc_handle_, job_fields.size(), job_fields.data(),
job_field_group, &fgid_);
// Start watch
const double max_keep_age = 30060; // Length of time to keep data in field in seconds
const int max_keep_samples = 10;
const int update_frequency = 1000000; // Once per minute
result = rdc_field_watch(rdc_handle_, group_id_, fgid_, update_frequency, max_keep_age,
max_keep_samples);
rdc_config_setting_list_t settings = {0, {}};
result = rdc_config_get(rdc_handle_, group_id_, &settings);
if (result == RDC_ST_OK) {
display_config_settings(settings);
} else if (result == RDC_ST_NOT_FOUND) {
std::cout << "Get config information failed, cache empty " << std::endl;
} else {
std::cout << "Get config information failed " << std::endl;
}
// Stop watching the field group
result = rdc_field_unwatch(rdc_handle_, group_id_, fgid_);
if (result != RDC_ST_OK) {
std::cout << "Error stop watch fields. Return: " << rdc_status_string(result);
}
std::cout << "Stop watch group:" << group_id_ << ", field_group:" << fgid_ << std::endl;
// Delete the field group and GPU group
result = rdc_group_field_destroy(rdc_handle_, fgid_);
if (result != RDC_ST_OK) {
std::cout << "Error delete field group. Return: " << rdc_status_string(result);
}
std::cout << "Deleted the field group " << fgid_ << std::endl;
break;
}
case CONFIG_COMMAND_CLEAR: {
result = rdc_config_clear(rdc_handle_, group_id_);
if (result == RDC_ST_OK) {
if (is_json_output()) {
json_ss << "\"group_id\": \"" << group_id_ << "\", \"status\": \"ok\"";
} else {
std::cout << "Successfully cleared all configurationbelongs for group: " << group_id_
<< std::endl;
}
std::cout << json_ss.str() << std::endl;
return;
}
break;
}
case CONFIG_COMMAND_HELP:
show_help();
result = RDC_ST_OK;
break;
case CONFIG_COMMAND_NONE:
default:
throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command");
}
if (result != RDC_ST_OK) {
throw RdcException(result, rdc_status_string(result));
}
}
// Prints the configured limits of the current group, one row per
// (setting, GPU) pair, together with the latest value of the matching field.
//
// rdc_configs_list: settings to display; only the first total_settings
//                   entries (capped at the array capacity) are read.
//
// Output goes to stdout, either as a text table or, in JSON mode, as
//   "group_id": <id>,"config_list" : [ {...}, ... ]
// Settings of an unrecognised type are skipped.
void RdciConfigSubSystem::display_config_settings(rdc_config_setting_list_t& rdc_configs_list) {
  std::stringstream ss, json_ss;

  ss << std::setw(TABLE_COLUMN_WIDTH) << std::left << "configure" << std::setw(TABLE_COLUMN_WIDTH)
     << std::left << "gpu_index" << std::setw(TABLE_COLUMN_WIDTH) << std::left << "config_limit"
     << std::setw(TABLE_COLUMN_WIDTH) << std::left << "current_value" << std::endl;
  json_ss << "\"group_id\": " << group_id_ << ","
          << "\"config_list\" : [";

  rdc_group_info_t rdc_group_info = {0, "", {0}};
  const rdc_status_t info_status =
      rdc_group_gpu_get_info(rdc_handle_, group_id_, &rdc_group_info);
  if (info_status == RDC_ST_OK) {
    // Clamp both counts to the capacity of their arrays before indexing.
    const uint32_t gpu_count = rdc_group_info.count < RDC_GROUP_MAX_ENTITIES
                                   ? rdc_group_info.count
                                   : RDC_GROUP_MAX_ENTITIES;
    const uint32_t setting_count = rdc_configs_list.total_settings < RDC_MAX_CONFIG_SETTINGS
                                       ? rdc_configs_list.total_settings
                                       : RDC_MAX_CONFIG_SETTINGS;

    std::vector<uint32_t> gpu_ids(rdc_group_info.entity_ids,
                                  rdc_group_info.entity_ids + gpu_count);
    std::sort(gpu_ids.begin(), gpu_ids.end());

    // A single flag drives the JSON separator, so skipped settings or empty
    // groups can never leave a dangling comma.
    bool first_entry = true;
    for (uint32_t i = 0; i < setting_count; ++i) {
      const char* table_label = nullptr;
      const char* json_limit_key = nullptr;
      const char* json_current_key = nullptr;
      rdc_field_t current_field = RDC_FI_GPU_CLOCK;
      bool scale_power = false;

      switch (rdc_configs_list.settings[i].type) {
        case RDC_CFG_GFX_CLOCK_LIMIT:
          table_label = "gfx_clock_limit";
          json_limit_key = "GFX Clock Limit";
          json_current_key = "GFX Clock Current Value";
          current_field = RDC_FI_GPU_CLOCK;
          break;
        case RDC_CFG_MEMORY_CLOCK_LIMIT:
          table_label = "memory_clock_limit";
          json_limit_key = "Memory Clock Limit";
          json_current_key = "Memory Clock Current Value";
          current_field = RDC_FI_MEM_CLOCK;
          break;
        case RDC_CFG_POWER_LIMIT:
          table_label = "power_limit";
          json_limit_key = "Power Limit";
          json_current_key = "Power Current Value";
          current_field = RDC_FI_POWER_USAGE;
          // Numeric power readings are divided by 1'000'000 before display
          // (presumably microwatts to watts - confirm the field's unit).
          // NOTE(review): the configured limit itself is printed unscaled.
          scale_power = true;
          break;
        default:
          continue;  // Unknown type: nothing to show for this setting.
      }

      const auto config_value = rdc_configs_list.settings[i].target_value;
      for (const uint32_t gpu : gpu_ids) {
        if (!first_entry) {
          json_ss << ",";
        }
        first_entry = false;

        json_ss << "{\"gpu_index\": " << gpu << ",\"" << json_limit_key << "\":" << config_value
                << ",\"" << json_current_key << "\":";
        // The trailing setw/left applies to the current-value column below.
        ss << std::setw(TABLE_COLUMN_WIDTH) << std::left << table_label
           << std::setw(TABLE_COLUMN_WIDTH) << std::left << gpu << std::setw(TABLE_COLUMN_WIDTH)
           << std::left << config_value << std::setw(TABLE_COLUMN_WIDTH) << std::left;

        rdc_field_value value;
        const rdc_status_t value_status =
            rdc_field_get_latest_value(rdc_handle_, gpu, current_field, &value);
        if (value_status != RDC_ST_OK) {
          ss << "N/A";
          json_ss << "\"N/A\"";
        } else if (value.type == INTEGER) {
          if (scale_power) {
            const double watts = static_cast<double>(value.value.l_int) / 1'000'000;
            ss << std::fixed << std::setprecision(3) << watts;
            json_ss << watts;
          } else {
            ss << value.value.l_int;
            json_ss << value.value.l_int;
          }
        } else if (value.type == DOUBLE) {
          const double shown = scale_power ? value.value.dbl / 1'000'000 : value.value.dbl;
          ss << std::fixed << std::setprecision(3) << shown;
          json_ss << shown;
        } else {
          ss << value.value.str;
          // Strings must be quoted to keep the JSON well formed.
          json_ss << "\"" << value.value.str << "\"";
        }
        ss << std::endl;
        json_ss << "}";
      }
    }
  } else {
    // Surface the failure in the text output instead of silently printing an
    // empty table; the JSON list is simply left empty.
    ss << "Failed to read GPU group information. Return: " << rdc_status_string(info_status)
       << std::endl;
  }
  json_ss << "]";

  if (is_json_output()) {
    std::cout << json_ss.str() << std::endl;
  } else {
    std::cout << ss.str() << std::endl;
  }
}
} // namespace rdc
} // namespace amd
+18 -10
Просмотреть файл
@@ -20,19 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <iostream>
#include <string>
#include <string.h>
#include <iostream>
#include <string>
#include "RdciConfigSubSystem.h"
#include "RdciDiagSubSystem.h"
#include "RdciDiscoverySubSystem.h"
#include "RdciDmonSubSystem.h"
#include "RdciFieldGroupSubSystem.h"
#include "RdciGroupSubSystem.h"
#include "RdciStatsSubSystem.h"
#include "RdciPolicySubSystem.h"
#include "RdciHealthSubSystem.h"
#include "RdciTopologyLinkSubSystem.h"
#include "RdciPolicySubSystem.h"
#include "RdciStatsSubSystem.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/rdc_common.h"
@@ -42,8 +44,11 @@ THE SOFTWARE.
#define RDC_CLIENT_VERSION_RELEASE 0
#define RDC_CLIENT_VERSION_CREATE_STRING(MAJOR, MINOR, RELEASE) (#MAJOR "." #MINOR "." #RELEASE)
#define RDC_CLIENT_VERSION_EXPAND_PARTS(MAJOR_STR, MINOR_STR, RELEASE_STR) RDC_CLIENT_VERSION_CREATE_STRING(MAJOR_STR, MINOR_STR, RELEASE_STR)
#define RDC_CLIENT_VERSION_STRING RDC_CLIENT_VERSION_EXPAND_PARTS(RDC_CLIENT_VERSION_MAJOR, RDC_CLIENT_VERSION_MINOR, RDC_CLIENT_VERSION_RELEASE)
#define RDC_CLIENT_VERSION_EXPAND_PARTS(MAJOR_STR, MINOR_STR, RELEASE_STR) \
RDC_CLIENT_VERSION_CREATE_STRING(MAJOR_STR, MINOR_STR, RELEASE_STR)
#define RDC_CLIENT_VERSION_STRING \
RDC_CLIENT_VERSION_EXPAND_PARTS(RDC_CLIENT_VERSION_MAJOR, RDC_CLIENT_VERSION_MINOR, \
RDC_CLIENT_VERSION_RELEASE)
#define Q(x) #x
#define QUOTE(x) Q(x)
@@ -52,7 +57,7 @@ int main(int argc, char** argv) {
const std::string usage_help =
"Usage:\trdci <subsystem>|<options>\n"
"subsystem: \n"
" discovery, dmon, group, fieldgroup, stats, diag, policy, health, topo\n"
" discovery, dmon, group, fieldgroup, stats, diag, config, policy, health, topo\n"
"options: \n"
" -v(--version) : Print client version information only\n";
@@ -63,11 +68,12 @@ int main(int argc, char** argv) {
if (strcmp(argv[1], "-v") == 0 || strcmp(argv[1], "--version") == 0) {
#ifdef CURRENT_GIT_HASH
std::cout << "RDCI : " << RDC_CLIENT_VERSION_STRING << "+" << QUOTE(CURRENT_GIT_HASH) << std::endl;
std::cout << "RDCI : " << RDC_CLIENT_VERSION_STRING << "+" << QUOTE(CURRENT_GIT_HASH)
<< std::endl;
#else
std::cout << "RDCI : " << RDC_CLIENT_VERSION_STRING << std::endl;
std::cout << "RDCI : " << RDC_CLIENT_VERSION_STRING << std::endl;
#endif
exit(0);
exit(0);
}
amd::rdc::RdciSubSystemPtr subsystem;
@@ -91,6 +97,8 @@ int main(int argc, char** argv) {
subsystem.reset(new amd::rdc::RdciStatsSubSystem());
} else if (subsystem_name == "policy") {
subsystem.reset(new amd::rdc::RdciPolicySubSystem());
} else if (subsystem_name == "config") {
subsystem.reset(new amd::rdc::RdciConfigSubSystem());
} else {
std::cout << usage_help;
exit(0);
+14 -6
Просмотреть файл
@@ -22,10 +22,10 @@ THE SOFTWARE.
#ifndef SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_
#define SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_
#include <thread>
#include <grpcpp/server_context.h>
#include <thread>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc.h"
@@ -157,12 +157,10 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
const ::rdc::GetTopologyRequest* request,
::rdc::GetTopologyResponse* reply) override;
::grpc::Status SetHealth(::grpc::ServerContext* context,
const ::rdc::SetHealthRequest* request,
::grpc::Status SetHealth(::grpc::ServerContext* context, const ::rdc::SetHealthRequest* request,
::rdc::SetHealthResponse* reply) override;
::grpc::Status GetHealth(::grpc::ServerContext* context,
const ::rdc::GetHealthRequest* request,
::grpc::Status GetHealth(::grpc::ServerContext* context, const ::rdc::GetHealthRequest* request,
::rdc::GetHealthResponse* reply) override;
::grpc::Status CheckHealth(::grpc::ServerContext* context,
@@ -173,6 +171,16 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
const ::rdc::ClearHealthRequest* request,
::rdc::ClearHealthResponse* reply) override;
::grpc::Status SetConfig(::grpc::ServerContext* context, const ::rdc::SetConfigRequest* request,
::rdc::SetConfigResponse* reply) override;
::grpc::Status GetConfig(::grpc::ServerContext* context, const ::rdc::GetConfigRequest* request,
::rdc::GetConfigResponse* reply) override;
::grpc::Status ClearConfig(::grpc::ServerContext* context,
const ::rdc::ClearConfigRequest* request,
::rdc::ClearConfigResponse* reply) override;
private:
bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target);
rdc_handle_t rdc_handle_;
+42 -1
Просмотреть файл
@@ -1009,7 +1009,7 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
to_incidents->set_component(incident.component);
to_incidents->set_health(incident.health);
//error
// error
auto to_error = to_incidents->mutable_error();
to_error->set_code(incident.error.code);
to_error->set_msg(incident.error.msg);
@@ -1062,6 +1062,47 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
static_cast<::rdc::TopologyLinkInfo_LinkType>(topology_results.link_infos[i].link_type));
linkinfos->set_p2p_accessible(topology_results.link_infos[i].is_p2p_accessible);
}
return ::grpc::Status::OK;
}
// gRPC handler: applies the configuration setting carried by the request to
// the requested GPU group and reports the rdc status code in the reply.
// The gRPC status itself is always OK.
::grpc::Status RdcAPIServiceImpl::SetConfig(::grpc::ServerContext* context,
                                            const ::rdc::SetConfigRequest* request,
                                            ::rdc::SetConfigResponse* reply) {
  (void)(context);

  // Translate the protobuf message into the native setting structure.
  const ::rdc::rdc_config_setting& wire_setting = request->setting();
  rdc_config_setting_t native_setting;
  native_setting.target_value = wire_setting.target_value();
  native_setting.type = static_cast<rdc_config_type_t>(wire_setting.type());

  const rdc_status_t rc = rdc_config_set(rdc_handle_, request->group_id(), native_setting);
  reply->set_status(static_cast<::uint32_t>(rc));

  return ::grpc::Status::OK;
}
// gRPC handler: fetches the configuration settings of the requested GPU group
// and copies them, together with the rdc status code, into the reply.
// The gRPC status itself is always OK.
::grpc::Status RdcAPIServiceImpl::GetConfig(::grpc::ServerContext* context,
                                            const ::rdc::GetConfigRequest* request,
                                            ::rdc::GetConfigResponse* reply) {
  (void)(context);

  // Zero-initialised so that total_settings is 0 (and the loop below a no-op)
  // whenever rdc_config_get fails without filling the list; otherwise
  // indeterminate stack contents would be sent back to the client.
  rdc_config_setting_list_t settings{};
  const rdc_status_t status = rdc_config_get(rdc_handle_, request->group_id(), &settings);
  reply->set_status(status);

  for (uint32_t i = 0; i < settings.total_settings && i < RDC_MAX_CONFIG_SETTINGS; ++i) {
    auto* entry = reply->add_settings();
    entry->set_type(static_cast<::rdc::rdc_config_type>(settings.settings[i].type));
    entry->set_target_value(settings.settings[i].target_value);
  }

  return ::grpc::Status::OK;
}
// gRPC handler: clears the configuration of the requested GPU group and
// reports the rdc status code in the reply.
// The gRPC status itself is always OK.
::grpc::Status RdcAPIServiceImpl::ClearConfig(::grpc::ServerContext* context,
                                              const ::rdc::ClearConfigRequest* request,
                                              ::rdc::ClearConfigResponse* reply) {
  (void)(context);

  const auto target_group = request->group_id();
  const rdc_status_t rc = rdc_config_clear(rdc_handle_, target_group);
  reply->set_status(rc);

  return ::grpc::Status::OK;
}