From 7f7cf5c1db3c33ba194625ad2ff45a8cf79fda5f Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 19 Feb 2020 15:19:24 -0500 Subject: [PATCH] Support discovery and group management in rdc_lib The rdc.h is modified for new discovery and grouping APIs. The RdcGroupSettingsImpl.cc is added to implement the GPU group and the field group management. The RdcMetricFetcherImpl.cc is added to fetch the metrics from rocm_smi_lib. Currently, only support power, memory, GPU utilization, temperature, GPU clock, total device and device name. A new example field_value_example.cc is added to demo how to record the fields and retrieve data from cache. Change-Id: I57acfa048fe9b3d848e2d441e768b3a63ccae3f8 [ROCm/rdc commit: a5f063f8b3342c54a2958b438d68b78bdd4289f0] --- projects/rdc/example/CMakeLists.txt | 10 + projects/rdc/example/field_value_example.cc | 276 +++++++++++ projects/rdc/example/job_stats_example.cc | 17 +- projects/rdc/include/rdc/rdc.h | 437 ++++++++++++++---- .../rdc/include/rdc_lib/RdcGroupSettings.h | 62 +++ projects/rdc/include/rdc_lib/RdcHandler.h | 51 +- .../rdc/include/rdc_lib/RdcMetricFetcher.h | 45 ++ .../include/rdc_lib/impl/RdcEmbeddedHandler.h | 60 ++- .../rdc_lib/impl/RdcGroupSettingsImpl.h | 71 +++ .../rdc_lib/impl/RdcMetricFetcherImpl.h | 39 ++ projects/rdc/include/rdc_lib/rdc_common.h | 16 + projects/rdc/rdc_libs/CMakeLists.txt | 11 +- .../rdc_libs/bootstrap/src/RdcBootStrap.cc | 175 ++++++- .../rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 196 ++++++-- .../rdc_libs/rdc/src/RdcGroupSettingsImpl.cc | 152 ++++++ .../rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 110 +++++ 16 files changed, 1558 insertions(+), 170 deletions(-) create mode 100644 projects/rdc/example/field_value_example.cc create mode 100644 projects/rdc/include/rdc_lib/RdcGroupSettings.h create mode 100644 projects/rdc/include/rdc_lib/RdcMetricFetcher.h create mode 100644 projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h create mode 100644 
projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h create mode 100644 projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc create mode 100644 projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc diff --git a/projects/rdc/example/CMakeLists.txt b/projects/rdc/example/CMakeLists.txt index 0f2445aa20..1ffda1b83f 100755 --- a/projects/rdc/example/CMakeLists.txt +++ b/projects/rdc/example/CMakeLists.txt @@ -76,6 +76,16 @@ add_executable(${JOBSTATS_EXAMPLE_EXE} "${JOBSTATS_EXAMPLE_SRC_LIST}") target_link_libraries(${JOBSTATS_EXAMPLE_EXE} pthread dl rdc_bootstrap) +set(FIELDVALUE_EXAMPLE_SRC_LIST "${SRC_DIR}/field_value_example.cc") +message("FIELDVALUE_EXAMPLE_SRC_LIST=${FIELDVALUE_EXAMPLE_SRC_LIST}") +set(FIELDVALUE_EXAMPLE_EXE "fieldvalue") + +link_directories(${LIB_BOOSTRAP_DIR}) + +add_executable(${FIELDVALUE_EXAMPLE_EXE} "${FIELDVALUE_EXAMPLE_SRC_LIST}") + +target_link_libraries(${FIELDVALUE_EXAMPLE_EXE} pthread dl rdc_bootstrap) + message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") message(" Finished Cmake Example ") diff --git a/projects/rdc/example/field_value_example.cc b/projects/rdc/example/field_value_example.cc new file mode 100644 index 0000000000..1fb7db9b33 --- /dev/null +++ b/projects/rdc/example/field_value_example.cc @@ -0,0 +1,276 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include "rdc/rdc.h" + +int main(int, char **) { + rdc_status_t result; + rdc_handle_t rdc_handle; + bool standalone = false; + char hostIpAddress[] = {"127.0.0.1:50051"}; + char group_name[] = {"group1"}; + char field_group_name[] = {"fieldgroup1"}; + uint64_t since_timestamp = 0; + uint64_t next_timestamp = 0; + uint64_t start_timestamp = 0; + uint32_t count = 0; + + + // Select the embedded mode and standalone mode dynamically. + std::cout << "Start rdci in: \n"; + std::cout << "0 - Embedded mode \n"; + std::cout << "1 - Standalone mode \n"; + while (!(std::cin >> standalone)) { + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); + } + std::cout << std::endl; + std::cout << (standalone? + "Standalone mode selected.\n":"Embedded mode selected.\n"); + + // Init the rdc + result = rdc_init(0); + + if (result != RDC_ST_OK) { + std::cout << "Error initializing RDC. Return: " << + rdc_status_string(result) << std::endl; + goto cleanup; + } else { + std::cout << "RDC Initialized.\n"; + } + + if (standalone) { // standalone + result = rdc_connect(hostIpAddress, &rdc_handle); + if ( result != RDC_ST_OK ) { + std::cout << "Error connecting to remote rdcd. Return: " + << rdc_status_string(result) << std::endl; + goto cleanup; + } + } else { // embedded + result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); + if (result != RDC_ST_OK) { + std::cout << "Error starting embedded RDC engine. 
Return: " + << rdc_status_string(result) << std::endl; + goto cleanup; + } + } + + // Now we can use the same API for both standalone and embedded + // Get the list of devices in the system + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + result = rdc_get_all_devices(rdc_handle, gpu_index_list, &count); + if (result != RDC_ST_OK) { + std::cout << "Error to find devices on the system. Return: " + << rdc_status_string(result); + goto cleanup; + } + if (count == 0) { + std::cout << "No GPUs find on the sytem "; + goto cleanup; + } else { + std::cout << count << " GPUs found in the system.\n"; + } + + // Create the group + rdc_gpu_group_t group_id; + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, + group_name, &group_id); + if (result != RDC_ST_OK) { + std::cout << "Error creating group. Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << "Created the GPU group " << group_id << std::endl; + + // Add all GPUs to the group + for (uint32_t i = 0; i < count; i++) { + result = rdc_group_gpu_add(rdc_handle, + group_id, gpu_index_list[i]); // Add GPU 0 + if (result != RDC_ST_OK) { + std::cout << "Error adding group. Return: " + << rdc_status_string(result); + goto cleanup; + } + rdc_device_attributes_t attribute; + result = rdc_get_device_attributes(rdc_handle, + gpu_index_list[i], &attribute); + if (result != RDC_ST_OK) { + std::cout << "Error get GPU attribute. 
Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << "Add GPU " <(time(nullptr)-10)*1000; + for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { + for (uint32_t findex = 0; findex < field_info.count; findex++) { + since_timestamp = start_timestamp; + while (true) { + rdc_field_value value; + result = rdc_get_field_value_since(rdc_handle, + group_info.entity_ids[gindex] , field_info.field_ids[findex], + since_timestamp, &next_timestamp, &value); + if (result == RDC_ST_NOT_FOUND) { + break; + } + if (result != RDC_ST_OK) { + std::cout << "Error get history data. Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << value.ts <<"\t" << group_info.entity_ids[gindex] + << "\t\t" << std::left << std::setw(16) + << field_id_string(value.field_id) << "\t" + << value.value.l_int << std::endl; + since_timestamp = next_timestamp; + } // while + } // for findex + } // for gindex + + // Delete the field group and GPU group + result = rdc_group_field_destroy(rdc_handle, field_group_id); + if (result != RDC_ST_OK) { + std::cout << "Error delete field group. Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << "Deleted the field group " << field_group_id << std::endl; + + result = rdc_group_gpu_destroy(rdc_handle, group_id); + if (result != RDC_ST_OK) { + std::cout << "Error delete GPU group. Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << "Deleted the GPU group " << group_id << std::endl; + + + // Cleanup consists of shutting down RDC. 
+ cleanup: + std::cout << "Cleaning up.\n"; + if (standalone) + rdc_disconnect(rdc_handle); + else + rdc_stop_embedded(rdc_handle); + rdc_shutdown(); + return result; +} diff --git a/projects/rdc/example/job_stats_example.cc b/projects/rdc/example/job_stats_example.cc index 8aab3f039f..a4a5135c96 100644 --- a/projects/rdc/example/job_stats_example.cc +++ b/projects/rdc/example/job_stats_example.cc @@ -94,15 +94,8 @@ int main(int, char **) { // (2) start the recording. Set the sample frequency to once per second, the // max keep age to one hour and the maximum number of samples to // keep to unlimited. - result = rdc_watch_job_fields(rdc_handle, group_id, 1000000, 3600, 0); - if (result != RDC_ST_OK) { - std::cout << "Error watch job fileds. Return: " - << rdc_status_string(result); - goto cleanup; - } - - // (3) Start a Slurm job on this group - result = rdc_job_start_stats(rdc_handle, group_id, job_id); + result = rdc_job_start_stats(rdc_handle, group_id, + job_id, 1000000, 3600, 0); if (result != RDC_ST_OK) { std::cout << "Error start job stats. Return: " << rdc_status_string(result); @@ -126,7 +119,7 @@ int main(int, char **) { usleep(5000000); // sleep 5 seconds before fetch the stats } - // (4) stop the Slurm job, which will stop the watch + // (3) stop the Slurm job, which will stop the watch // We do not have to stop the job to get stats. The rdc_job_get_stats can be // called at any time before stop result = rdc_job_stop_stats(rdc_handle, job_id); @@ -136,7 +129,7 @@ int main(int, char **) { goto cleanup; } - // (5) Get the stats + // (4) Get the stats rdc_job_info_t job_info; result = rdc_job_get_stats(rdc_handle, job_id, &job_info); @@ -178,7 +171,7 @@ int main(int, char **) { std::cout << "No data for job stats found." << std::endl; } - // Cleanup consists of shutting down DCGM. + // Cleanup consists of shutting down RDC. 
cleanup: std::cout << "Cleaning up.\n"; if (standalone) diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index f1d1a73f09..3b54e7abe8 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -31,30 +31,33 @@ THE SOFTWARE. * in this file. * * @brief The rocm_rdc library api is new, and therefore subject to change - * either at the ABI or API level. Instead of marking every function prototype as "unstable", - * we areinstead saying the API is unstable (i.e., changes are possible) while the - * major version remains 0. This means that if the API/ABI changes, we will - * not increment the major version to 1. Once the ABI stabilizes, we will - * increment the major version to 1, and thereafter increment it on all ABI - * breaks. + * either at the ABI or API level. Instead of marking every function prototype + * as "unstable", we are instead saying the API is unstable (i.e., changes + * are possible) while the major version remains 0. This means that if the + * API/ABI changes, we will not increment the major version to 1. Once the + * ABI stabilizes, we will increment the major version to 1, and thereafter + * increment it on all ABI breaks. */ /** - * @brief Error codes retured by rocm_rdc_lib functions + * @brief Error codes returned by rocm_rdc_lib functions */ typedef enum { RDC_ST_OK = 0, RDC_ST_NOT_SUPPORTED, //!< Not supported feature RDC_ST_MSI_ERROR, //!< The MSI library error RDC_ST_FAIL_LOAD_MODULE, //!< Fail to load the library - RDC_ST_INVALID_HANDLER //!< Fail to load the library + RDC_ST_INVALID_HANDLER, //!< Invalid handler + RDC_ST_BAD_PARAMETER, //!< A parameter is invalid + RDC_ST_NOT_FOUND, //!< Cannot find the value + RDC_ST_MAX_LIMIT //!< Max limit recording for the object } rdc_status_t; /** * @brief rdc operation mode * rdc can run in auto mode where background threads will collect metrics.
- * When run in manual mode, the user needs to periodically call rdc_update_all_fields - * for data collection. + * When run in manual mode, the user needs to periodically call + * rdc_update_all_fields for data collection. */ typedef enum { RDC_OPERATION_MODE_AUTO = 0, @@ -94,6 +97,11 @@ typedef enum { */ #define RDC_GROUP_MAX_ENTITIES 64 +/** + * @brief Max number of GPUs supported by RDC + */ +#define RDC_MAX_NUM_DEVICES 16 + /** * @brief The max fields in a field group */ @@ -134,6 +142,17 @@ typedef enum { */ #define RDC_FI_GPU_TEMP 150 +/** + * GPU count in the system + */ +#define RDC_FI_GPU_COUNT 4 + +/** + * Name of the device + */ +#define RDC_FI_DEV_NAME 50 + + /** * @brief handlers used in various rdc calls */ @@ -141,6 +160,25 @@ typedef void *rdc_handle_t; typedef uint32_t rdc_gpu_group_t; typedef uint32_t rdc_field_grp_t; +/** + * @brief Represents attributes corresponding to a device + */ +typedef struct { + char device_name[RDC_MAX_STR_LENGTH]; //!< Name of the device. +} rdc_device_attributes_t; + +/** + * @brief The structure to store the group info + */ +typedef struct { + unsigned int count; //!< count of GPUs in the group + char group_name[RDC_MAX_STR_LENGTH]; //!< group name + /** + * The list of entities in the group + */ + uint32_t entity_ids[RDC_GROUP_MAX_ENTITIES]; +} rdc_group_info_t; + /** * @brief The structure to store summary of data */ @@ -154,7 +192,7 @@ typedef struct { * @brief The structure to hold the GPU usage information */ typedef struct { - uint32_t gpu_id; //!< GPU_ID_INVALID for summary information + uint32_t gpu_id; //!< GPU_ID_INVALID for summary information uint64_t start_time; //!< The time to start the watching uint64_t end_time; //!< The time to stop the watching @@ -180,7 +218,7 @@ typedef struct { * @brief The structure to store the field value */ typedef struct { - uint16_t field_id; //!< The field id of the value + uint32_t field_id; //!< The field id of the value int status; //!< RDC_ST_OK or error status 
uint64_t ts; //!< Timestamp in usec since 1970 rdc_field_type_t type; //!< The field type @@ -238,9 +276,9 @@ rdc_status_t rdc_shutdown(); * @brief Start embedded RDC agent within this process. * * @details The RDC is loaded as library so that it does not require rdcd - * daemon. In this mode, the user has to periodically call rdc_update_all_fields() - * when op_mode is RDC_OPERATION_MODE_MANUAL, which tells RDC to collect - * the stats. This function is not thread safe. + * daemon. In this mode, the user has to periodically call + * rdc_update_all_fields() when op_mode is RDC_OPERATION_MODE_MANUAL, which + * tells RDC to collect the stats. This function is not thread safe. * * @param[in] op_mode Operation modes. When RDC_OPERATION_MODE_AUTO, RDC schedules * background task to collect the stats. When RDC_OPERATION_MODE_MANUAL, the user @@ -257,10 +295,11 @@ rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, /** * @brief Stop embedded RDC agent. * - * @details Stop the embedded RDC agent, and p_rdc_handle becomes invalid after - * this call. This function is not thread safe. + * @details Stop the embedded RDC agent, and p_rdc_handle becomes + * invalid after this call. This function is not thread safe. * - * @param[in] p_rdc_handle The RDC handler that come from rdc_start_embedded(). + * @param[in] p_rdc_handle The RDC handler that come from + * rdc_start_embedded(). * @retval ::RDC_ST_OK is returned upon successful call. */ rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle); @@ -271,12 +310,13 @@ rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle); * @details This method is used to connect to a remote stand-alone rdcd daemon. * This function is not thread safe. * - * @param[in] ipAndPort The IP and port of the remote rdcd. The ipAndPort can be - * specified in this x.x.x.x:yyyy format, where x.x.x.x is the IP address and - * yyyy is the port. + * @param[in] ipAndPort The IP and port of the remote rdcd. 
The ipAndPort + * can be specified in this x.x.x.x:yyyy format, where x.x.x.x is the + * IP address and yyyy is the port. * * @param[inout] p_rdc_handle Caller provided pointer to rdc_handle_t. Upon - * successful call, the value will contain the handler for following API calls. + * successful call, the value will contain the handler + * for following API calls. * * @retval ::RDC_ST_OK is returned upon successful call. */ @@ -285,8 +325,8 @@ rdc_status_t rdc_connect(const char *ipAndPort, rdc_handle_t* p_rdc_handle); /** * @brief Disconnect from rdcd daemon. * - * @details Disconnect from rdcd daemon, and p_rdc_handle becomes invalid after - * this call. This function is not thread safe. + * @details Disconnect from rdcd daemon, and p_rdc_handle becomes invalid + * after this call. This function is not thread safe. * * @param[in] p_rdc_handle The RDC handler that come from rdc_connect(). * @@ -294,57 +334,20 @@ rdc_status_t rdc_connect(const char *ipAndPort, rdc_handle_t* p_rdc_handle); */ rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle); -/** - * @brief Create a group contains multiple GPUs - * - * @details This method can create a group contains multiple GPUs. Instead of - * executing an operation separately for each GPU, the RDC group enables - * the user to execute same operation on all the GPUs present in the group as a - * single API call. - * - * @param[in] p_rdc_handle The RDC handler. - * - * @param[in] type The type of the group. RDC_GROUP_DEFAULT includes all the GPUs - * on the node, and RDC_GROUP_EMPTY creates an empty group. - * - * @param[in] group_name The group name specified as NULL terminated C String - * - * @param[inout] p_rdc_group_id Caller provided pointer to rdc_gpu_group_t. Upon - * successful call, the value will contain the group id for following group API calls. - * - * @retval ::RDC_ST_OK is returned upon successful call. 
- */ -rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, - rdc_group_type_t type, const char* group_name, - rdc_gpu_group_t* p_rdc_group_id); - -/** - * @brief Add a GPU to the group - * - * @details This method can add a GPU to the group - * - * @param[in] p_rdc_handle The The RDC handler. - * - * @param[in] group_id The group id to which the GPU will be added. - * - * @param[in] gpu_index The GPU index to be added to the group. - * - * @retval ::RDC_ST_OK is returned upon successful call. - */ -rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, uint32_t gpu_index); - /** * @brief Request the RDC to watch the job stats * - * @details The summary job stats can be retrieved using rdc_job_get_stats() + * @details This should be executed as part of job prologue. The summary + * job stats can be retrieved using rdc_job_get_stats(). * In RDC_OPERATION_MODE_MANUAL, user must call rdc_update_all_fields(1) * at least once, before call rdc_job_get_stats() * - * @param[in] p_rdc_handle The The RDC handler. + * @param[in] p_rdc_handle The RDC handler. * * @param[in] group_id The group of GPUs to be watched. * + * @param[in] job_id The name of the job. + * * @param[in] update_freq How often to update this field in usec. * * @param[in] max_keep_age How long to keep data for this field in seconds. @@ -353,30 +356,17 @@ rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, * * @retval ::RDC_ST_OK is returned upon successful call. */ -rdc_status_t rdc_watch_job_fields(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples); - -/** - * @brief Request RDC a job to be started - * - * @details This should be execute as part of job prologue - * - * @param[in] p_rdc_handle The The RDC handler. - * - * @param[in] job_id The name of the job. - * - * @retval ::RDC_ST_OK is returned upon successful call. 
- */ rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, char job_id[64]); + rdc_gpu_group_t group_id, char job_id[64], uint64_t update_freq, + double max_keep_age, uint32_t max_keep_samples); /** * @brief Get the stats of the job using the job id. * - * @details The stats can be retrieved at any point when the job is in process. + * @details The stats can be retrieved at any point when the job is in + * process. * - * @param[in] p_rdc_handle The The RDC handler. + * @param[in] p_rdc_handle The RDC handler. * * @param[in] job_id The name of the job. * @@ -391,11 +381,11 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64], /** * @brief Request RDC to stop watching the stats of the job * - * @details This should be execute as part of job epilogue. The job Id remains - * available to view the stats at any point. You must call rdc_watch_job_fields() - * before this call. + * @details This should be executed as part of job epilogue. The job Id + * remains available to view the stats at any point. You must call + * rdc_job_start_stats() before this call. * - * @param[in] p_rdc_handle The The RDC handler. + * @param[in] p_rdc_handle The RDC handler. * * @param[in] job_id The name of the job. * @@ -407,9 +397,10 @@ rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, /** * @brief Request RDC to update all fields to be watched. * - * @details In RDC_OPERATION_MODE_MANUAL, the user must call this method periodically. + * @details In RDC_OPERATION_MODE_MANUAL, the user must call this method + * periodically. * - * @param[in] p_rdc_handle The The RDC handler. + * @param[in] p_rdc_handle The RDC handler. * * @param[in] wait_for_update Whether or not to wait for the update loop to * complete before returning to the caller 1=wait. 0=do not wait.
@@ -419,15 +410,279 @@ rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, rdc_status_t rdc_update_all_fields(rdc_handle_t p_rdc_handle, uint32_t wait_for_update); +/** + * @brief Get indexes corresponding to all the devices on the system. + * + * @details Each index represents the RDC GPU Id corresponding to a GPU on the + * system and is immutable during the lifespan of the engine. The list + * should be queried again if the engine is restarted. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[out] gpu_index_list Array reference to fill GPU indexes present on + * the system. + * + * @param[out] count Number of GPUs returned in gpu_index_list. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_get_all_devices(rdc_handle_t p_rdc_handle, + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count); + +/** + * @brief Gets device attributes corresponding to the gpu_index. + * + * @details Fetch the attributes, such as device name, of a GPU. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] gpu_index GPU index corresponding to which the attributes + * should be fetched + * + * @param[out] p_rdc_attr GPU attribute corresponding to the gpu_index. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_get_device_attributes(rdc_handle_t p_rdc_handle, + uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr); + +/** + * @brief Create a group that contains multiple GPUs + * + * @details This method creates a group that contains multiple GPUs. Instead of + * executing an operation separately for each GPU, the RDC group enables + * the user to execute same operation on all the GPUs present in the group as + * a single API call. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] type The type of the group. RDC_GROUP_DEFAULT includes all the + * GPUs on the node, and RDC_GROUP_EMPTY creates an empty group.
+ * + * @param[in] group_name The group name specified as NULL terminated C String + * + * @param[inout] p_rdc_group_id Caller provided pointer to rdc_gpu_group_t. + * Upon successful call, the value will contain the group id for following + * group API calls. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, + rdc_group_type_t type, const char* group_name, + rdc_gpu_group_t* p_rdc_group_id); + +/** + * @brief Add a GPU to the group + * + * @details This method can add a GPU to the group + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The group id to which the GPU will be added. + * + * @param[in] gpu_index The GPU index to be added to the group. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, uint32_t gpu_index); + +/** + * @brief Get information about a GPU group + * + * @details Get detail information about a GPU group created by + * rdc_group_gpu_create + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] p_rdc_group_id The GPU group handler created by + * rdc_group_gpu_create + * + * @param[out] p_rdc_group_info The information of the GPU + * group p_rdc_group_id. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info); + +/** + * @brief Destroy GPU group represented by p_rdc_group_id + * + * @details Delete the logic group represented by p_rdc_group_id + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] p_rdc_group_id The group id + * + * @retval ::RDC_ST_OK is returned upon successful call. 
+ */ +rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t p_rdc_group_id); + +/** + * @brief create a group of fields + * + * @details The user can create a group of fields and perform an operation + * on a group of fields at once. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] num_field_ids Number of field IDs that are being provided + * in field_ids. + * + * @param[in] field_ids Field IDs to be added to the newly-created + * field group. + * + * @param[in] field_group_name Unique name for this group of fields. + * + * @param[out] rdc_field_group_id Handle to the newly-created field group + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle, + uint32_t num_field_ids, uint32_t* field_ids, + const char* field_group_name, rdc_field_grp_t* rdc_field_group_id); + +/** + * @brief Get information about a field group + * + * @details Get detail information about a field group created by + * rdc_group_field_create + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] rdc_field_group_id The field group handler created by + * rdc_group_field_create + * + * @param[out] field_group_info The information of the field group + * rdc_field_group_id. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, + rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info); + +/** + * @brief Destroy field group represented by rdc_field_group_id + * + * @details Delete the logic group represented by rdc_field_group_id + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] rdc_field_group_id The field group id + * + * @retval ::RDC_ST_OK is returned upon successful call. 
+ */ +rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, + rdc_field_grp_t rdc_field_group_id); + +/** + * @brief Request the RDC to start recording updates for a given field + * collection. + * + * @details Note that the first update of the field will not occur + * until the next field update cycle. To force a field update cycle, + * user must call rdc_update_all_fields(1) + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The group of GPUs to be watched. + * + * @param[in] field_group_id The collection of fields to record + * + * @param[in] update_freq How often to update fields in usec. + * + * @param[in] max_keep_age How long to keep data for fields in seconds. + * + * @param[in] max_keep_samples Maximum number of samples to keep. 0=no limit. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_watch_fields(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples); + +/** + * @brief Request the latest cached field of a GPU + * + * @details Note that the field can be cached after calling rdc_watch_fields + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] gpu_index The GPU index. + * + * @param[in] field The field id + * + * @param[out] value The field value got from cache. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_get_latest_value_for_field(rdc_handle_t p_rdc_handle, + uint32_t gpu_index, uint32_t field, rdc_field_value* value); + +/** + * @brief Request a historical cached field of a GPU + * + * @details Note that the field can be cached after calling rdc_watch_fields + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] gpu_index The GPU index. + * + * @param[in] field The field id + * + * @param[in] since_time_stamp Timestamp to request values since in + * usec since 1970.
+ * + * @param[out] next_since_time_stamp Timestamp to use for sinceTimestamp + * on next call to this function + * + * @param[out] value The field value got from cache. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_get_field_value_since(rdc_handle_t p_rdc_handle, + uint32_t gpu_index, uint32_t field, uint64_t since_time_stamp, + uint64_t *next_since_time_stamp, rdc_field_value* value); + +/** + * @brief Stop record updates for a given field collection. + * + * @details The cache of those fields will not be updated after this call + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id. + * + * @param[in] field_group_id The field group id. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_unwatch_fields(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id); + /** * @brief Get a description of a provided RDC error status * * @details return the string in human readable format. * - * @param[in] status The The RDC status. + * @param[in] status The RDC status. * * @retval The string to describe the RDC status. */ const char* rdc_status_string(rdc_status_t status); +/** + * @brief Get the name of a field + * + * @details return the string in human readable format. + * + * @param[in] field_id The field id. + * + * @retval The string to describe the field. + */ +const char* field_id_string(uint32_t field_id); + #endif // RDC_RDC_H_ diff --git a/projects/rdc/include/rdc_lib/RdcGroupSettings.h b/projects/rdc/include/rdc_lib/RdcGroupSettings.h new file mode 100644 index 0000000000..1c12cb37c3 --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcGroupSettings.h @@ -0,0 +1,62 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef RDC_LIB_RDCGROUPSETTINGS_H_ +#define RDC_LIB_RDCGROUPSETTINGS_H_ + +#include +#include "rdc_lib/rdc_common.h" +#include "rdc/rdc.h" + +namespace amd { +namespace rdc { + +class RdcGroupSettings { + public: + virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, + const char* group_name, rdc_gpu_group_t* p_rdc_group_id) = 0; + virtual rdc_status_t rdc_group_gpu_destroy( + rdc_gpu_group_t p_rdc_group_id) = 0; + virtual rdc_status_t rdc_group_gpu_add( + rdc_gpu_group_t groupId, uint32_t gpu_index) = 0; + virtual rdc_status_t rdc_group_gpu_get_info( + rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) = 0; + + virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids, + uint32_t* field_ids, const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) = 0; + virtual rdc_status_t rdc_group_field_destroy( + rdc_field_grp_t rdc_field_group_id) = 0; + virtual rdc_status_t rdc_group_field_get_info( + rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) = 0; + + virtual ~RdcGroupSettings() {} +}; + +typedef std::shared_ptr RdcGroupSettingsPtr; + + +} // namespace rdc +} // namespace amd + +#endif // RDC_LIB_RDCGROUPSETTINGS_H_ diff --git a/projects/rdc/include/rdc_lib/RdcHandler.h b/projects/rdc/include/rdc_lib/RdcHandler.h index 73a7d90d89..74db05a53f 100644 --- a/projects/rdc/include/rdc_lib/RdcHandler.h +++ b/projects/rdc/include/rdc_lib/RdcHandler.h @@ -31,21 +31,54 @@ namespace rdc { // Interface class class RdcHandler { public: - virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, - const char* group_name, rdc_gpu_group_t* p_rdc_group_id) = 0; - virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, - uint32_t gpu_index) = 0; - + // Job API virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64]) = 0; - virtual rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t groupId, - uint64_t update_freq, double max_keep_age, - uint32_t 
max_keep_samples) = 0; + char job_id[64], uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) = 0; virtual rdc_status_t rdc_job_get_stats(char jobId[64], rdc_job_info_t* p_job_info)= 0; virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + + // Discovery API + virtual rdc_status_t rdc_get_all_devices( + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) = 0; + virtual rdc_status_t rdc_get_device_attributes(uint32_t gpu_index, + rdc_device_attributes_t* p_rdc_attr) = 0; + + // Group API + virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, + const char* group_name, rdc_gpu_group_t* p_rdc_group_id) = 0; + virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, + uint32_t gpu_index) = 0; + virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids, + uint32_t* field_ids, const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) = 0; + virtual rdc_status_t rdc_group_field_get_info( + rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) = 0; + virtual rdc_status_t rdc_group_gpu_get_info( + rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) = 0; + virtual rdc_status_t rdc_group_gpu_destroy( + rdc_gpu_group_t p_rdc_group_id) = 0; + virtual rdc_status_t rdc_group_field_destroy( + rdc_field_grp_t rdc_field_group_id) = 0; + + // Field API + virtual rdc_status_t rdc_watch_fields(rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id, uint64_t update_freq, + double max_keep_age, uint32_t max_keep_samples) = 0; + virtual rdc_status_t rdc_get_latest_value_for_field(uint32_t gpu_index, + uint32_t field, rdc_field_value* value) = 0; + virtual rdc_status_t rdc_get_field_value_since(uint32_t gpu_index, + uint32_t field, uint64_t since_time_stamp, + uint64_t *next_since_time_stamp, rdc_field_value* value) = 0; + virtual rdc_status_t rdc_unwatch_fields(rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id) = 0; + + // Control API virtual rdc_status_t 
rdc_update_all_fields(uint32_t wait_for_update) = 0; + virtual ~RdcHandler(){} }; diff --git a/projects/rdc/include/rdc_lib/RdcMetricFetcher.h b/projects/rdc/include/rdc_lib/RdcMetricFetcher.h new file mode 100644 index 0000000000..3f8ad0b493 --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcMetricFetcher.h @@ -0,0 +1,45 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef RDC_LIB_RDCMETRICFETCHER_H_ +#define RDC_LIB_RDCMETRICFETCHER_H_ + +#include +#include "rdc_lib/rdc_common.h" +#include "rdc/rdc.h" + + +namespace amd { +namespace rdc { + +class RdcMetricFetcher { + public: + virtual rdc_status_t fetch_smi_field(uint32_t gpu_index, + uint32_t field_id, rdc_field_value* value) = 0; + virtual ~RdcMetricFetcher() {} +}; + +typedef std::shared_ptr RdcMetricFetcherPtr; + +} // namespace rdc +} // namespace amd + +#endif // RDC_LIB_RDCMETRICFETCHER_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h index 9abf0f4106..72638f855c 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -23,33 +23,67 @@ THE SOFTWARE. #define RDC_LIB_IMPL_RDCEMBEDDEDHANDLER_H_ #include "rdc_lib/RdcHandler.h" - +#include "rdc_lib/RdcGroupSettings.h" +#include "rdc_lib/RdcMetricFetcher.h" namespace amd { namespace rdc { -class RdcEmbeddedHandler: public RdcHandler -{ +class RdcEmbeddedHandler: public RdcHandler { public: - rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, - const char* group_name, - rdc_gpu_group_t* p_rdc_group_id) override; - rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, - uint32_t gpu_index) override; - + // Job API rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64]) override; - - rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t groupId, - uint64_t update_freq, double max_keep_age, + char job_id[64], uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples) override; rdc_status_t rdc_job_get_stats(char jobId[64], rdc_job_info_t* p_job_info) override; rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + // Discovery API + rdc_status_t rdc_get_all_devices( + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) override; + rdc_status_t rdc_get_device_attributes(uint32_t gpu_index, + rdc_device_attributes_t* 
p_rdc_attr) override; + + // Group API + rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, + const char* group_name, + rdc_gpu_group_t* p_rdc_group_id) override; + rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, + uint32_t gpu_index) override; + rdc_status_t rdc_group_field_create(uint32_t num_field_ids, + uint32_t* field_ids, const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) override; + rdc_status_t rdc_group_field_get_info( + rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) override; + rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) override; + rdc_status_t rdc_group_gpu_destroy( + rdc_gpu_group_t p_rdc_group_id) override; + rdc_status_t rdc_group_field_destroy( + rdc_field_grp_t rdc_field_group_id) override; + + // Field API + rdc_status_t rdc_watch_fields(rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id, uint64_t update_freq, + double max_keep_age, uint32_t max_keep_samples) override; + rdc_status_t rdc_get_latest_value_for_field(uint32_t gpu_index, + uint32_t field, rdc_field_value* value) override; + rdc_status_t rdc_get_field_value_since(uint32_t gpu_index, + uint32_t field, uint64_t since_time_stamp, + uint64_t *next_since_time_stamp, rdc_field_value* value) override; + rdc_status_t rdc_unwatch_fields(rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id) override; + + // Control API rdc_status_t rdc_update_all_fields(uint32_t wait_for_update) override; explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode); + + private: + RdcGroupSettingsPtr group_settings_; + RdcMetricFetcherPtr metric_fetcher_; }; } // namespace rdc diff --git a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h new file mode 100644 index 0000000000..5470bb38f8 --- /dev/null +++ b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h @@ -0,0 +1,71 @@ +/* +Copyright (c) 2020 - 
present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+#ifndef RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_
+#define RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_
+
+
+#include  // NOTE(review): the header names of these four #include lines are missing in this copy of the patch
+#include
+#include
+#include
+#include "rdc_lib/RdcGroupSettings.h"
+
+namespace amd {
+namespace rdc {
+
+class RdcGroupSettingsImpl: public RdcGroupSettings {  // keeps GPU groups and field groups in two id-keyed maps
+ public:
+    rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
+        const char* group_name, rdc_gpu_group_t* p_rdc_group_id) override;
+    rdc_status_t rdc_group_gpu_destroy(
+        rdc_gpu_group_t p_rdc_group_id) override;
+    rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
+        uint32_t gpu_index) override;
+    rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
+        rdc_group_info_t* p_rdc_group_info) override;
+
+    rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
+        uint32_t* field_ids, const char* field_group_name,
+        rdc_field_grp_t* rdc_field_group_id) override;
+    rdc_status_t rdc_group_field_destroy(
+        rdc_field_grp_t rdc_field_group_id) override;
+    rdc_status_t rdc_group_field_get_info(
+        rdc_field_grp_t rdc_field_group_id,
+        rdc_field_group_info_t* field_group_info) override;
+
+    RdcGroupSettingsImpl();
+
+ private:
+    std::map gpu_group_;  // GPU groups, looked up by group id (template arguments are missing in this copy)
+    std::map field_group_;  // field groups, looked up by field group id (template arguments are missing in this copy)
+    uint32_t cur_group_id_ = 0;  // id given to the next GPU group that is created
+    uint32_t cur_filed_group_id_ = 0;  // id given to the next field group that is created ("filed" is a typo for "field")
+    std::mutex group_mutex_;  // held while gpu_group_ and cur_group_id_ are used
+    std::mutex field_group_mutex_;  // held while field_group_ and cur_filed_group_id_ are used
+
+};
+
+}  // namespace rdc
+}  // namespace amd
+
+
+#endif  // RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_
diff --git a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h
new file mode 100644
index 0000000000..d82c06f9b9
--- /dev/null
+++ b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h
@@ -0,0 +1,39 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
+#define RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
+
+#include "rdc_lib/RdcMetricFetcher.h"
+
+namespace amd {
+namespace rdc {
+
+class RdcMetricFetcherImpl: public RdcMetricFetcher {  // concrete fetcher; the commit message says it reads metrics from rocm_smi_lib
+ public:
+    rdc_status_t fetch_smi_field(uint32_t gpu_index,
+        uint32_t field_id, rdc_field_value* value) override;  // writes the value of field_id for gpu_index into *value
+};
+
+}  // namespace rdc
+}  // namespace amd
+
+#endif  // RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
diff --git a/projects/rdc/include/rdc_lib/rdc_common.h b/projects/rdc/include/rdc_lib/rdc_common.h
index 9960e536da..ca7b905dc8 100644
--- a/projects/rdc/include/rdc_lib/rdc_common.h
+++ b/projects/rdc/include/rdc_lib/rdc_common.h
@@ -31,5 +31,21 @@ THE SOFTWARE.
 #define LOG_DEBUG(message)
 #endif
 
+/**
+ * @brief A strncpy variant that always null-terminates the destination
+ *
+ * @details It will copy at most n-1 bytes from src to dest, and
+ * always adds a null terminator following the bytes copied to dest.
+ * + * @param[out] dest The destination string to copy + * + * @param[in] src The source string to be copied + * + * @param[in] n At most n-1 bytes will be copied + * + * @retval Return a pointer to the destination string. + */ +char *strncpy_with_null(char *dest, const char *src, size_t n); + #endif // RDC_LIB_RDC_COMMON_H_ diff --git a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt index cf23fb8068..ce16dcc9fa 100755 --- a/projects/rdc/rdc_libs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/CMakeLists.txt @@ -137,8 +137,15 @@ set_property(TARGET ${BOOTSTRAP_LIB} PROPERTY # librdc.so set up set(RDC_LIB "rdc") set(RDC_LIB_COMPONENT "lib${RDC_LIB}") -set(RDC_LIB_SRC_LIST "${SRC_DIR}/rdc/src/RdcEmbeddedHandler.cc") -set(RDC_LIB_INC_LIST "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcEmbeddedHandler.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcMetricFetcherImpl.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcGroupSettingsImpl.cc") + +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcMetricFetcher.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcMetricFetcherImpl.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcGroupSettings.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcGroupSettingsImpl.h") message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}") link_directories(${RSMI_LIB_DIR}) diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc index 6f0afa3025..1cbb97d21a 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
OTHER DEALINGS IN THE SOFTWARE. */ #include +#include +#include #include "rdc/rdc.h" #include "rdc_lib/RdcHandler.h" #include "rdc_lib/rdc_common.h" @@ -117,19 +119,6 @@ rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle) { return RDC_ST_OK; } - -rdc_status_t rdc_watch_job_fields(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } - - return static_cast(p_rdc_handle)-> - rdc_watch_job_fields(group_id, update_freq, - max_keep_age, max_keep_samples); -} - rdc_status_t rdc_update_all_fields(rdc_handle_t p_rdc_handle, uint32_t wait_for_update) { if (!p_rdc_handle) { @@ -151,13 +140,15 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64] , } rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t groupId, char job_id[64] ) { + rdc_gpu_group_t groupId, char job_id[64], uint64_t update_freq, + double max_keep_age, uint32_t max_keep_samples ) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } return static_cast(p_rdc_handle)-> - rdc_job_start_stats(groupId, job_id); + rdc_job_start_stats(groupId, job_id, update_freq, + max_keep_age, max_keep_samples); } @@ -191,6 +182,124 @@ rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, rdc_group_gpu_add(groupId, gpuIndex); } +rdc_status_t rdc_get_all_devices(rdc_handle_t p_rdc_handle, + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { + if (!p_rdc_handle || !count) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_get_all_devices(gpu_index_list, count); +} + +rdc_status_t rdc_get_device_attributes(rdc_handle_t p_rdc_handle, + uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr) { + if (!p_rdc_handle || !p_rdc_attr) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_get_device_attributes(gpu_index, p_rdc_attr); +} + +rdc_status_t 
rdc_group_field_create(rdc_handle_t p_rdc_handle, + uint32_t num_field_ids, uint32_t* field_ids, + const char* field_group_name, rdc_field_grp_t* rdc_field_group_id) { + if (!p_rdc_handle || !field_ids || + !field_group_name || !rdc_field_group_id) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_group_field_create(num_field_ids, field_ids, + field_group_name, rdc_field_group_id); +} + +rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, + rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_group_field_get_info(rdc_field_group_id, field_group_info); +} + +rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) { + if (!p_rdc_handle || !p_rdc_group_info) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_group_gpu_get_info(p_rdc_group_id, p_rdc_group_info); +} + +rdc_status_t rdc_watch_fields(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_watch_fields(group_id, field_group_id, update_freq, + max_keep_age, max_keep_samples); +} + +rdc_status_t rdc_get_latest_value_for_field(rdc_handle_t p_rdc_handle, + uint32_t gpu_index, uint32_t field, rdc_field_value* value) { + if (!p_rdc_handle || !value) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_get_latest_value_for_field(gpu_index, field, value); +} + +rdc_status_t rdc_get_field_value_since(rdc_handle_t p_rdc_handle, + uint32_t gpu_index, uint32_t field, uint64_t since_time_stamp, + uint64_t *next_since_time_stamp, rdc_field_value* value) { + if (!p_rdc_handle || 
!next_since_time_stamp || !value) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_get_field_value_since(gpu_index, field, since_time_stamp, + next_since_time_stamp, value); +} + +rdc_status_t rdc_unwatch_fields(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_unwatch_fields(group_id, field_group_id); +} + +rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t p_rdc_group_id) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_group_gpu_destroy(p_rdc_group_id); +} + +rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, + rdc_field_grp_t rdc_field_group_id) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_group_field_destroy(rdc_field_group_id); +} + const char* rdc_status_string(rdc_status_t result) { switch (result) { case RDC_ST_OK: @@ -201,8 +310,44 @@ const char* rdc_status_string(rdc_status_t result) { return "Fail to load module"; case RDC_ST_INVALID_HANDLER: return "Invalid handler"; + case RDC_ST_NOT_FOUND: + return "Cannot find the value"; + case RDC_ST_BAD_PARAMETER: + return "Invalid parameters"; + case RDC_ST_MSI_ERROR: + return "SMI error"; + case RDC_ST_MAX_LIMIT: + return "The max limit reached"; default: return "Unknown"; } } +const char* field_id_string(uint32_t field_id) { + const std::map id_name = { + {RDC_FI_GPU_MEMORY_USAGE, "GPU_MEMORY_USAGE"}, + {RDC_FI_GPU_MEMORY_TOTAL, "GPU_MEMORY_TOTAL"}, + {RDC_FI_POWER_USAGE, "POWER_USAGE"}, + {RDC_FI_GPU_SM_CLOCK, "GPU_SM_CLOCK"}, + {RDC_FI_GPU_UTIL, "GPU_UTIL"}, + {RDC_FI_GPU_TEMP, "GPU_TEMP"}, + {RDC_FI_GPU_COUNT, "GPU_COUNT"}, + {RDC_FI_DEV_NAME, "DEV_NAME"} + }; + + auto search = id_name.find(field_id); + if (search == id_name.end()) { + return "UNKNOWN_FIELD"; + } + + 
return search->second; +} + +char *strncpy_with_null(char *dest, const char *src, size_t n) { + if (n == 0) { + return dest; + } + strncpy(dest, src, n - 1); + dest[n - 1]= '\0'; + return dest; +} diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 7850b35f00..7e6cff9248 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -20,6 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "rdc_lib/impl/RdcEmbeddedHandler.h" +#include +#include "rdc_lib/impl/RdcMetricFetcherImpl.h" +#include "rdc_lib/impl/RdcGroupSettingsImpl.h" #include "rdc_lib/rdc_common.h" #include "rocm_smi/rocm_smi.h" @@ -47,51 +50,188 @@ namespace amd { namespace rdc { -RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t) { - // TODO(next_step): implement +RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode) : + group_settings_(new RdcGroupSettingsImpl()) + , metric_fetcher_(new RdcMetricFetcherImpl()) { + // TODO(bill_liu): implement the operation mode + (void)(mode); } -rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_create(rdc_group_type_t type, - const char* group_name, rdc_gpu_group_t* p_rdc_group_id) { - // TODO(next_step): implement - return RDC_ST_OK; -} +// JOB API +rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, + char job_id[64], uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) { + // TODO(bill_liu): implement + (void)(groupId); + (void)(job_id); + (void)(update_freq); + (void)(max_keep_age); + (void)(max_keep_samples); -rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, - uint32_t gpu_index ) { - // TODO(next_step): implement - return RDC_ST_OK; -} - - -rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats( - rdc_gpu_group_t groupId, char job_id[64]) { - // TODO(next_step): implement - return 
RDC_ST_OK; -} - -rdc_status_t RdcEmbeddedHandler::rdc_watch_job_fields(rdc_gpu_group_t groupId, - uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples) { - // TODO(next_step): implement return RDC_ST_OK; } rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64], rdc_job_info_t* p_job_info) { - // TODO(next_step): implement - LOG_DEBUG("RdcEmbeddedHandler::rdc_job_get_stats:" << job_id); + // TODO(bill_liu): implement + (void)(job_id); + (void)(p_job_info); return RDC_ST_OK; } rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64] ) { - // TODO(next_step): implement + // TODO(bill_liu): implement + (void)(job_id); return RDC_ST_OK; } + +// Discovery API +rdc_status_t RdcEmbeddedHandler::rdc_get_all_devices( + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { + if (!count) { + return RDC_ST_BAD_PARAMETER; + } + rdc_field_value device_count; + rdc_status_t status = metric_fetcher_-> + fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count); + if (status != RDC_ST_OK) { + return status; + } + + // Assign the index to the index list + *count = device_count.value.l_int; + for (uint32_t i=0; i < *count; i++) { + gpu_index_list[i] = i; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcEmbeddedHandler::rdc_get_device_attributes(uint32_t gpu_index, + rdc_device_attributes_t* p_rdc_attr) { + if (!p_rdc_attr) { + return RDC_ST_BAD_PARAMETER; + } + rdc_field_value device_name; + rdc_status_t status = metric_fetcher_-> + fetch_smi_field(gpu_index, RDC_FI_DEV_NAME, &device_name); + strncpy_with_null(p_rdc_attr->device_name, device_name.value.str, + RDC_MAX_STR_LENGTH); + return status; +} + + + // Group API +rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_create(rdc_group_type_t type, + const char* group_name, + rdc_gpu_group_t* p_rdc_group_id) { + if (!group_name || !p_rdc_group_id) { + return RDC_ST_BAD_PARAMETER; + } + return group_settings_-> + rdc_group_gpu_create(type, group_name, p_rdc_group_id); +} + +rdc_status_t 
RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, + uint32_t gpu_index) { + return group_settings_->rdc_group_gpu_add(group_id, gpu_index); +} + +rdc_status_t RdcEmbeddedHandler::rdc_group_field_create(uint32_t num_field_ids, + uint32_t* field_ids, const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) { + if (!field_group_name || !rdc_field_group_id || !field_ids) { + return RDC_ST_BAD_PARAMETER; + } + return group_settings_->rdc_group_field_create( + num_field_ids, field_ids, field_group_name, rdc_field_group_id); +} + +rdc_status_t RdcEmbeddedHandler::rdc_group_field_get_info( + rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) { + if (!field_group_info) { + return RDC_ST_BAD_PARAMETER; + } + return group_settings_->rdc_group_field_get_info( + rdc_field_group_id, field_group_info); +} + +rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_get_info( + rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) { + if (!p_rdc_group_info) { + return RDC_ST_BAD_PARAMETER; + } + + return group_settings_->rdc_group_gpu_get_info( + p_rdc_group_id, p_rdc_group_info); +} + +rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_destroy( + rdc_gpu_group_t p_rdc_group_id) { + return group_settings_->rdc_group_gpu_destroy(p_rdc_group_id); +} + +rdc_status_t RdcEmbeddedHandler::rdc_group_field_destroy( + rdc_field_grp_t rdc_field_group_id) { + return group_settings_->rdc_group_field_destroy(rdc_field_group_id); +} + +// Field API +rdc_status_t RdcEmbeddedHandler::rdc_watch_fields(rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id, uint64_t update_freq, + double max_keep_age, uint32_t max_keep_samples) { + // TODO(bill_liu): implement + (void)(group_id); + (void)(field_group_id); + (void)(update_freq); + (void)(max_keep_age); + (void)(max_keep_samples); + return RDC_ST_OK; +} + +rdc_status_t RdcEmbeddedHandler::rdc_get_latest_value_for_field( + uint32_t gpu_index, uint32_t field, rdc_field_value* value) { + // 
TODO(bill_liu): implement + if (!value) { + return RDC_ST_BAD_PARAMETER; + } + (void)(gpu_index); + (void)(field); + return RDC_ST_NOT_FOUND; +} + +rdc_status_t RdcEmbeddedHandler::rdc_get_field_value_since(uint32_t gpu_index, + uint32_t field, uint64_t since_time_stamp, + uint64_t *next_since_time_stamp, rdc_field_value* value) { + // TODO(bill_liu): implement + if (!next_since_time_stamp || !value) { + return RDC_ST_BAD_PARAMETER; + } + (void)(since_time_stamp); + (void)(gpu_index); + (void)(field); + (void)(value); + + return RDC_ST_NOT_FOUND; +} + +rdc_status_t RdcEmbeddedHandler::rdc_unwatch_fields(rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id) { + // TODO(bill_liu): implement + (void)(group_id); + (void)(field_group_id); + return RDC_ST_OK; +} + + +// Control API rdc_status_t RdcEmbeddedHandler::rdc_update_all_fields( uint32_t wait_for_update) { - // TODO(next_step): implement - LOG_DEBUG("RdcEmbeddedHandler::rdc_update_all_fields"); + // TODO(bill_liu): implement + (void)(wait_for_update); return RDC_ST_OK; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc new file mode 100644 index 0000000000..f95960a528 --- /dev/null +++ b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc @@ -0,0 +1,152 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "rdc_lib/impl/RdcGroupSettingsImpl.h"
+// NOTE(review): the system header name on the next line and the template
+// argument of std::lock_guard were unreadable in the reviewed copy; <mutex>
+// and std::mutex are assumed - confirm against the class declaration.
+#include <mutex>
+#include "rdc_lib/rdc_common.h"
+
+namespace amd {
+namespace rdc {
+
+RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
+}
+
+// Creates a new, empty GPU group and hands its id back to the caller.
+//   type            RDC_GROUP_DEFAULT is rejected (see TODO).
+//   group_name      copied into the group record with strncpy_with_null,
+//                   bounded by RDC_MAX_STR_LENGTH.
+//   p_rdc_group_id  receives the id assigned to the new group.
+// Returns RDC_ST_NOT_SUPPORTED for RDC_GROUP_DEFAULT, RDC_ST_BAD_PARAMETER
+// when group_name or p_rdc_group_id is null, otherwise RDC_ST_OK.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create(rdc_group_type_t type,
+        const char* group_name, rdc_gpu_group_t* p_rdc_group_id) {
+    // TODO(bill_liu): handle type to create default group for all GPUs
+    if (type == RDC_GROUP_DEFAULT) {
+        return RDC_ST_NOT_SUPPORTED;
+    }
+    if (!group_name || !p_rdc_group_id) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+
+    // Value-initialize the record: rdc_group_gpu_add() reads count, so it
+    // must start at 0 rather than at an indeterminate value.
+    rdc_group_info_t ginfo{};
+    strncpy_with_null(ginfo.group_name, group_name, RDC_MAX_STR_LENGTH);
+
+    std::lock_guard<std::mutex> guard(group_mutex_);
+    gpu_group_.emplace(cur_group_id_, ginfo);
+    *p_rdc_group_id = cur_group_id_;
+    cur_group_id_++;
+
+    return RDC_ST_OK;
+}
+
+
+// Removes the GPU group stored under p_rdc_group_id.
+// Always returns RDC_ST_OK; the result of erase() is not inspected.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_destroy(
+        rdc_gpu_group_t p_rdc_group_id) {
+    std::lock_guard<std::mutex> guard(group_mutex_);
+    gpu_group_.erase(p_rdc_group_id);
+    return RDC_ST_OK;
+}
+
+// Appends gpu_index to the GPU group groupId.
+// Returns RDC_ST_NOT_FOUND when the group does not exist,
+// RDC_ST_BAD_PARAMETER when gpu_index is already a member,
+// RDC_ST_MAX_LIMIT when the group already holds RDC_GROUP_MAX_ENTITIES
+// entries, otherwise RDC_ST_OK.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add(
+        rdc_gpu_group_t groupId, uint32_t gpu_index ) {
+    std::lock_guard<std::mutex> guard(group_mutex_);
+    auto ite = gpu_group_.find(groupId);
+    if (ite == gpu_group_.end()) {
+        // Previously this fell through and reported success although
+        // nothing had been added.
+        return RDC_ST_NOT_FOUND;
+    }
+
+    auto& group = ite->second;
+    // Check whether the index already exists
+    for (uint32_t i = 0; i < group.count; i++) {
+        if (group.entity_ids[i] == gpu_index) {
+            return RDC_ST_BAD_PARAMETER;
+        }
+    }
+    if (group.count >= RDC_GROUP_MAX_ENTITIES) {
+        return RDC_ST_MAX_LIMIT;
+    }
+    group.entity_ids[group.count] = gpu_index;
+    group.count++;
+
+    return RDC_ST_OK;
+}
+
+// Copies the name, entry count and entity ids of GPU group p_rdc_group_id
+// into *p_rdc_group_info.
+// Returns RDC_ST_BAD_PARAMETER when p_rdc_group_info is null,
+// RDC_ST_NOT_FOUND when the group does not exist, otherwise RDC_ST_OK.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_get_info(
+        rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) {
+    if (!p_rdc_group_info) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+
+    std::lock_guard<std::mutex> guard(group_mutex_);
+    auto ite = gpu_group_.find(p_rdc_group_id);
+    if (ite == gpu_group_.end()) {
+        return RDC_ST_NOT_FOUND;
+    }
+
+    // Read through a reference; the record does not need to be copied twice.
+    const auto& info = ite->second;
+    strncpy_with_null(p_rdc_group_info->group_name,
+            info.group_name, RDC_MAX_STR_LENGTH);
+    p_rdc_group_info->count = info.count;
+    for (uint32_t i = 0; i < info.count; i++) {
+        p_rdc_group_info->entity_ids[i] = info.entity_ids[i];
+    }
+
+    return RDC_ST_OK;
+}
+
+// Creates a field group holding the first num_field_ids entries of field_ids.
+//   field_group_name    copied with strncpy_with_null, bounded by
+//                       RDC_MAX_STR_LENGTH.
+//   rdc_field_group_id  receives the id assigned to the new field group.
+// Returns RDC_ST_BAD_PARAMETER when field_group_name or rdc_field_group_id
+// is null, or when field_ids is null while num_field_ids is non-zero;
+// RDC_ST_MAX_LIMIT when num_field_ids exceeds
+// RDC_MAX_FIELD_IDS_PER_FIELD_GROUP; otherwise RDC_ST_OK.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_field_create(
+        uint32_t num_field_ids, uint32_t* field_ids,
+        const char* field_group_name, rdc_field_grp_t* rdc_field_group_id) {
+    if (!field_group_name || !rdc_field_group_id ||
+            (num_field_ids > 0 && !field_ids)) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+    // Validate the size before touching the record.
+    if (num_field_ids > RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) {
+        return RDC_ST_MAX_LIMIT;
+    }
+
+    rdc_field_group_info_t finfo{};
+    finfo.count = num_field_ids;
+    strncpy_with_null(finfo.group_name, field_group_name, RDC_MAX_STR_LENGTH);
+    for (uint32_t i = 0; i < num_field_ids; i++) {
+        finfo.field_ids[i] = field_ids[i];
+    }
+
+    std::lock_guard<std::mutex> guard(field_group_mutex_);
+    field_group_.emplace(cur_filed_group_id_, finfo);
+    *rdc_field_group_id = cur_filed_group_id_;
+    cur_filed_group_id_++;
+
+    return RDC_ST_OK;
+}
+
+// Removes the field group stored under rdc_field_group_id.
+// Always returns RDC_ST_OK; the result of erase() is not inspected.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_field_destroy(
+        rdc_field_grp_t rdc_field_group_id) {
+    std::lock_guard<std::mutex> guard(field_group_mutex_);
+    field_group_.erase(rdc_field_group_id);
+    return RDC_ST_OK;
+}
+
+// Copies the name, entry count and field ids of field group
+// rdc_field_group_id into *field_group_info.
+// Returns RDC_ST_BAD_PARAMETER when field_group_info is null,
+// RDC_ST_NOT_FOUND when the field group does not exist, otherwise RDC_ST_OK.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_field_get_info(
+        rdc_field_grp_t rdc_field_group_id,
+        rdc_field_group_info_t* field_group_info) {
+    if (!field_group_info) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+
+    std::lock_guard<std::mutex> guard(field_group_mutex_);
+    auto ite = field_group_.find(rdc_field_group_id);
+    if (ite == field_group_.end()) {
+        return RDC_ST_NOT_FOUND;
+    }
+
+    const auto& info = ite->second;
+    strncpy_with_null(field_group_info->group_name, info.group_name,
+            RDC_MAX_STR_LENGTH);
+    field_group_info->count = info.count;
+    for (uint32_t i = 0; i < info.count; i++) {
+        field_group_info->field_ids[i] = info.field_ids[i];
+    }
+    return RDC_ST_OK;
+}
+
+
+}  // namespace rdc
+}  // namespace amd
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
new file mode 100644
index 0000000000..aa24bf43b1
--- /dev/null
+++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
@@ -0,0 +1,110 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
+// NOTE(review): four system header names were unreadable in the reviewed
+// copy; the two below are the ones this file uses directly (gettimeofday,
+// fixed-width integers). Restore any others from upstream if still needed.
+#include <sys/time.h>
+#include <cstdint>
+#include "rdc_lib/rdc_common.h"
+#include "rocm_smi/rocm_smi.h"
+
+namespace amd {
+namespace rdc {
+
+// Reads one metric for one GPU through rocm_smi and stores it in *value.
+//   gpu_index  device index handed to the rsmi_* calls (not used by
+//              RDC_FI_GPU_COUNT).
+//   field_id   one of the RDC_FI_* ids handled in the switch below.
+//   value      receives the timestamp (milliseconds, from gettimeofday),
+//              field_id, the rocm_smi status, the value type and, when the
+//              rocm_smi call succeeds, the reading itself.
+// Returns RDC_ST_BAD_PARAMETER when value is null, RDC_ST_OK when the
+// rocm_smi call reports RSMI_STATUS_SUCCESS, and RDC_ST_MSI_ERROR otherwise,
+// which includes field ids that have no case below.
+// NOTE(review): the static_cast target types were unreadable in the reviewed
+// copy; uint64_t (ts) and int64_t (l_int) are assumed - confirm against
+// rdc_field_value in rdc.h.
+rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
+        uint32_t field_id, rdc_field_value* value) {
+    if (!value) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    value->ts = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
+    value->field_id = field_id;
+    // Unhandled field ids keep this status and are reported as an error.
+    value->status = RSMI_STATUS_NOT_SUPPORTED;
+
+    uint64_t i64 = 0;
+    switch (field_id) {
+        case RDC_FI_GPU_MEMORY_USAGE:
+            value->status = rsmi_dev_memory_usage_get(gpu_index,
+                    RSMI_MEM_TYPE_VRAM, &i64);
+            value->type = INTEGER;
+            if (value->status == RSMI_STATUS_SUCCESS) {
+                value->value.l_int = static_cast<int64_t>(i64);
+            }
+            break;
+        case RDC_FI_GPU_MEMORY_TOTAL:
+            value->status = rsmi_dev_memory_total_get(gpu_index,
+                    RSMI_MEM_TYPE_VRAM, &i64);
+            value->type = INTEGER;
+            if (value->status == RSMI_STATUS_SUCCESS) {
+                value->value.l_int = static_cast<int64_t>(i64);
+            }
+            break;
+        // The cases below declare locals, so each gets its own scope instead
+        // of letting later labels jump over the declarations.
+        case RDC_FI_GPU_COUNT: {
+            uint32_t num_gpu = 0;
+            value->status = rsmi_num_monitor_devices(&num_gpu);
+            value->type = INTEGER;
+            if (value->status == RSMI_STATUS_SUCCESS) {
+                value->value.l_int = static_cast<int64_t>(num_gpu);
+            }
+            break;
+        }
+        case RDC_FI_POWER_USAGE: {
+            // The second argument of rsmi_dev_power_ave_get is a sensor
+            // index. The code used to pass the temperature-metric enumerator
+            // RSMI_TEMP_CURRENT here; presumably that has the same numeric
+            // value (0), so behaviour should be unchanged - confirm against
+            // rocm_smi.h.
+            const uint32_t power_sensor_index = 0;
+            value->status = rsmi_dev_power_ave_get(gpu_index,
+                    power_sensor_index, &i64);
+            value->type = INTEGER;
+            if (value->status == RSMI_STATUS_SUCCESS) {
+                value->value.l_int = static_cast<int64_t>(i64);
+            }
+            break;
+        }
+        case RDC_FI_GPU_SM_CLOCK: {
+            rsmi_frequencies_t f;
+            value->status = rsmi_dev_gpu_clk_freq_get(gpu_index,
+                    RSMI_CLK_TYPE_SYS, &f);
+            value->type = INTEGER;
+            // f is only read after a successful call.
+            if (value->status == RSMI_STATUS_SUCCESS) {
+                value->value.l_int = f.frequency[f.current];
+            }
+            break;
+        }
+        case RDC_FI_GPU_UTIL: {
+            uint32_t busy_percent = 0;
+            value->status = rsmi_dev_busy_percent_get(gpu_index, &busy_percent);
+            value->type = INTEGER;
+            if (value->status == RSMI_STATUS_SUCCESS) {
+                value->value.l_int = static_cast<int64_t>(busy_percent);
+            }
+            break;
+        }
+        case RDC_FI_DEV_NAME:
+            // The name is written straight into the caller's string buffer.
+            value->status = rsmi_dev_name_get(gpu_index,
+                    value->value.str, RDC_MAX_STR_LENGTH);
+            value->type = STRING;
+            break;
+        default:
+            break;
+    }
+
+    return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
+}
+
+
+}  // namespace rdc
+}  // namespace amd