diff --git a/projects/rdc/example/CMakeLists.txt b/projects/rdc/example/CMakeLists.txt index 0f2445aa20..1ffda1b83f 100755 --- a/projects/rdc/example/CMakeLists.txt +++ b/projects/rdc/example/CMakeLists.txt @@ -76,6 +76,16 @@ add_executable(${JOBSTATS_EXAMPLE_EXE} "${JOBSTATS_EXAMPLE_SRC_LIST}") target_link_libraries(${JOBSTATS_EXAMPLE_EXE} pthread dl rdc_bootstrap) +set(FIELDVALUE_EXAMPLE_SRC_LIST "${SRC_DIR}/field_value_example.cc") +message("FIELDVALUE_EXAMPLE_SRC_LIST=${FIELDVALUE_EXAMPLE_SRC_LIST}") +set(FIELDVALUE_EXAMPLE_EXE "fieldvalue") + +link_directories(${LIB_BOOSTRAP_DIR}) + +add_executable(${FIELDVALUE_EXAMPLE_EXE} "${FIELDVALUE_EXAMPLE_SRC_LIST}") + +target_link_libraries(${FIELDVALUE_EXAMPLE_EXE} pthread dl rdc_bootstrap) + message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") message(" Finished Cmake Example ") diff --git a/projects/rdc/example/field_value_example.cc b/projects/rdc/example/field_value_example.cc new file mode 100644 index 0000000000..1fb7db9b33 --- /dev/null +++ b/projects/rdc/example/field_value_example.cc @@ -0,0 +1,276 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include "rdc/rdc.h" + +int main(int, char **) { + rdc_status_t result; + rdc_handle_t rdc_handle; + bool standalone = false; + char hostIpAddress[] = {"127.0.0.1:50051"}; + char group_name[] = {"group1"}; + char field_group_name[] = {"fieldgroup1"}; + uint64_t since_timestamp = 0; + uint64_t next_timestamp = 0; + uint64_t start_timestamp = 0; + uint32_t count = 0; + + + // Select the embedded mode and standalone mode dynamically. + std::cout << "Start rdci in: \n"; + std::cout << "0 - Embedded mode \n"; + std::cout << "1 - Standalone mode \n"; + while (!(std::cin >> standalone)) { + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); + } + std::cout << std::endl; + std::cout << (standalone? + "Standalone mode selected.\n":"Embedded mode selected.\n"); + + // Init the rdc + result = rdc_init(0); + + if (result != RDC_ST_OK) { + std::cout << "Error initializing RDC. Return: " << + rdc_status_string(result) << std::endl; + goto cleanup; + } else { + std::cout << "RDC Initialized.\n"; + } + + if (standalone) { // standalone + result = rdc_connect(hostIpAddress, &rdc_handle); + if ( result != RDC_ST_OK ) { + std::cout << "Error connecting to remote rdcd. Return: " + << rdc_status_string(result) << std::endl; + goto cleanup; + } + } else { // embedded + result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); + if (result != RDC_ST_OK) { + std::cout << "Error starting embedded RDC engine. 
Return: " + << rdc_status_string(result) << std::endl; + goto cleanup; + } + } + + // Now we can use the same API for both standalone and embedded + // Get the list of devices in the system + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + result = rdc_get_all_devices(rdc_handle, gpu_index_list, &count); + if (result != RDC_ST_OK) { + std::cout << "Error to find devices on the system. Return: " + << rdc_status_string(result); + goto cleanup; + } + if (count == 0) { + std::cout << "No GPUs find on the sytem "; + goto cleanup; + } else { + std::cout << count << " GPUs found in the system.\n"; + } + + // Create the group + rdc_gpu_group_t group_id; + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, + group_name, &group_id); + if (result != RDC_ST_OK) { + std::cout << "Error creating group. Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << "Created the GPU group " << group_id << std::endl; + + // Add all GPUs to the group + for (uint32_t i = 0; i < count; i++) { + result = rdc_group_gpu_add(rdc_handle, + group_id, gpu_index_list[i]); // Add GPU 0 + if (result != RDC_ST_OK) { + std::cout << "Error adding group. Return: " + << rdc_status_string(result); + goto cleanup; + } + rdc_device_attributes_t attribute; + result = rdc_get_device_attributes(rdc_handle, + gpu_index_list[i], &attribute); + if (result != RDC_ST_OK) { + std::cout << "Error get GPU attribute. 
Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << "Add GPU " <(time(nullptr)-10)*1000; + for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { + for (uint32_t findex = 0; findex < field_info.count; findex++) { + since_timestamp = start_timestamp; + while (true) { + rdc_field_value value; + result = rdc_get_field_value_since(rdc_handle, + group_info.entity_ids[gindex] , field_info.field_ids[findex], + since_timestamp, &next_timestamp, &value); + if (result == RDC_ST_NOT_FOUND) { + break; + } + if (result != RDC_ST_OK) { + std::cout << "Error get history data. Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << value.ts <<"\t" << group_info.entity_ids[gindex] + << "\t\t" << std::left << std::setw(16) + << field_id_string(value.field_id) << "\t" + << value.value.l_int << std::endl; + since_timestamp = next_timestamp; + } // while + } // for findex + } // for gindex + + // Delete the field group and GPU group + result = rdc_group_field_destroy(rdc_handle, field_group_id); + if (result != RDC_ST_OK) { + std::cout << "Error delete field group. Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << "Deleted the field group " << field_group_id << std::endl; + + result = rdc_group_gpu_destroy(rdc_handle, group_id); + if (result != RDC_ST_OK) { + std::cout << "Error delete GPU group. Return: " + << rdc_status_string(result); + goto cleanup; + } + std::cout << "Deleted the GPU group " << group_id << std::endl; + + + // Cleanup consists of shutting down RDC. 
+ cleanup: + std::cout << "Cleaning up.\n"; + if (standalone) + rdc_disconnect(rdc_handle); + else + rdc_stop_embedded(rdc_handle); + rdc_shutdown(); + return result; +} diff --git a/projects/rdc/example/job_stats_example.cc b/projects/rdc/example/job_stats_example.cc index 8aab3f039f..a4a5135c96 100644 --- a/projects/rdc/example/job_stats_example.cc +++ b/projects/rdc/example/job_stats_example.cc @@ -94,15 +94,8 @@ int main(int, char **) { // (2) start the recording. Set the sample frequency to once per second, the // max keep age to one hour and the maximum number of samples to // keep to unlimited. - result = rdc_watch_job_fields(rdc_handle, group_id, 1000000, 3600, 0); - if (result != RDC_ST_OK) { - std::cout << "Error watch job fileds. Return: " - << rdc_status_string(result); - goto cleanup; - } - - // (3) Start a Slurm job on this group - result = rdc_job_start_stats(rdc_handle, group_id, job_id); + result = rdc_job_start_stats(rdc_handle, group_id, + job_id, 1000000, 3600, 0); if (result != RDC_ST_OK) { std::cout << "Error start job stats. Return: " << rdc_status_string(result); @@ -126,7 +119,7 @@ int main(int, char **) { usleep(5000000); // sleep 5 seconds before fetch the stats } - // (4) stop the Slurm job, which will stop the watch + // (3) stop the Slurm job, which will stop the watch // We do not have to stop the job to get stats. The rdc_job_get_stats can be // called at any time before stop result = rdc_job_stop_stats(rdc_handle, job_id); @@ -136,7 +129,7 @@ int main(int, char **) { goto cleanup; } - // (5) Get the stats + // (4) Get the stats rdc_job_info_t job_info; result = rdc_job_get_stats(rdc_handle, job_id, &job_info); @@ -178,7 +171,7 @@ int main(int, char **) { std::cout << "No data for job stats found." << std::endl; } - // Cleanup consists of shutting down DCGM. + // Cleanup consists of shutting down RDC. 
cleanup: std::cout << "Cleaning up.\n"; if (standalone) diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index f1d1a73f09..3b54e7abe8 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -31,30 +31,33 @@ THE SOFTWARE. * in this file. * * @brief The rocm_rdc library api is new, and therefore subject to change - * either at the ABI or API level. Instead of marking every function prototype as "unstable", - * we areinstead saying the API is unstable (i.e., changes are possible) while the - * major version remains 0. This means that if the API/ABI changes, we will - * not increment the major version to 1. Once the ABI stabilizes, we will - * increment the major version to 1, and thereafter increment it on all ABI - * breaks. + * either at the ABI or API level. Instead of marking every function prototype + * as "unstable", we are instead saying the API is unstable (i.e., changes + * are possible) while the major version remains 0. This means that if the + * API/ABI changes, we will not increment the major version to 1. Once the + * ABI stabilizes, we will increment the major version to 1, and thereafter + * increment it on all ABI breaks. */ /** - * @brief Error codes retured by rocm_rdc_lib functions + * @brief Error codes returned by rocm_rdc_lib functions */ typedef enum { RDC_ST_OK = 0, RDC_ST_NOT_SUPPORTED, //!< Not supported feature RDC_ST_MSI_ERROR, //!< The MSI library error RDC_ST_FAIL_LOAD_MODULE, //!< Fail to load the library - RDC_ST_INVALID_HANDLER //!< Fail to load the library + RDC_ST_INVALID_HANDLER, //!< Invalid handler + RDC_ST_BAD_PARAMETER, //!< A parameter is invalid + RDC_ST_NOT_FOUND, //!< Cannot find the value + RDC_ST_MAX_LIMIT //!< Max limit recording for the object } rdc_status_t; /** * @brief rdc operation mode * rdc can run in auto mode where background threads will collect metrics. 
- * When run in manual mode, the user needs to periodically call rdc_update_all_fields - * for data collection. + * When run in manual mode, the user needs to periodically call + * rdc_update_all_fields for data collection. */ typedef enum { RDC_OPERATION_MODE_AUTO = 0, @@ -94,6 +97,11 @@ typedef enum { */ #define RDC_GROUP_MAX_ENTITIES 64 +/** + * @brief Max number of GPUs supported by RDC + */ +#define RDC_MAX_NUM_DEVICES 16 + /** * @brief The max fields in a field group */ @@ -134,6 +142,17 @@ typedef enum { */ #define RDC_FI_GPU_TEMP 150 +/** + * GPU count in the system + */ +#define RDC_FI_GPU_COUNT 4 + +/** + * Name of the device + */ +#define RDC_FI_DEV_NAME 50 + + /** * @brief handlers used in various rdc calls */ @@ -141,6 +160,25 @@ typedef void *rdc_handle_t; typedef uint32_t rdc_gpu_group_t; typedef uint32_t rdc_field_grp_t; +/** + * @brief Represents attributes corresponding to a device + */ +typedef struct { + char device_name[RDC_MAX_STR_LENGTH]; //!< Name of the device. +} rdc_device_attributes_t; + +/** + * @brief The structure to store the group info + */ +typedef struct { + unsigned int count; //!< count of GPUs in the group + char group_name[RDC_MAX_STR_LENGTH]; //!< group name + /** + * The list of entities in the group + */ + uint32_t entity_ids[RDC_GROUP_MAX_ENTITIES]; +} rdc_group_info_t; + /** * @brief The structure to store summary of data */ @@ -154,7 +192,7 @@ typedef struct { * @brief The structure to hold the GPU usage information */ typedef struct { - uint32_t gpu_id; //!< GPU_ID_INVALID for summary information + uint32_t gpu_id; //!< GPU_ID_INVALID for summary information uint64_t start_time; //!< The time to start the watching uint64_t end_time; //!< The time to stop the watching @@ -180,7 +218,7 @@ typedef struct { * @brief The structure to store the field value */ typedef struct { - uint16_t field_id; //!< The field id of the value + uint32_t field_id; //!< The field id of the value int status; //!< RDC_ST_OK or error status 
uint64_t ts; //!< Timestamp in usec since 1970 rdc_field_type_t type; //!< The field type @@ -238,9 +276,9 @@ rdc_status_t rdc_shutdown(); * @brief Start embedded RDC agent within this process. * * @details The RDC is loaded as library so that it does not require rdcd - * daemon. In this mode, the user has to periodically call rdc_update_all_fields() - * when op_mode is RDC_OPERATION_MODE_MANUAL, which tells RDC to collect - * the stats. This function is not thread safe. + * daemon. In this mode, the user has to periodically call + * rdc_update_all_fields() when op_mode is RDC_OPERATION_MODE_MANUAL, which + * tells RDC to collect the stats. This function is not thread safe. * * @param[in] op_mode Operation modes. When RDC_OPERATION_MODE_AUTO, RDC schedules * background task to collect the stats. When RDC_OPERATION_MODE_MANUAL, the user @@ -257,10 +295,11 @@ rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, /** * @brief Stop embedded RDC agent. * - * @details Stop the embedded RDC agent, and p_rdc_handle becomes invalid after - * this call. This function is not thread safe. + * @details Stop the embedded RDC agent, and p_rdc_handle becomes + * invalid after this call. This function is not thread safe. * - * @param[in] p_rdc_handle The RDC handler that come from rdc_start_embedded(). + * @param[in] p_rdc_handle The RDC handler that come from + * rdc_start_embedded(). * @retval ::RDC_ST_OK is returned upon successful call. */ rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle); @@ -271,12 +310,13 @@ rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle); * @details This method is used to connect to a remote stand-alone rdcd daemon. * This function is not thread safe. * - * @param[in] ipAndPort The IP and port of the remote rdcd. The ipAndPort can be - * specified in this x.x.x.x:yyyy format, where x.x.x.x is the IP address and - * yyyy is the port. + * @param[in] ipAndPort The IP and port of the remote rdcd. 
The ipAndPort + * can be specified in this x.x.x.x:yyyy format, where x.x.x.x is the + * IP address and yyyy is the port. * * @param[inout] p_rdc_handle Caller provided pointer to rdc_handle_t. Upon - * successful call, the value will contain the handler for following API calls. + * successful call, the value will contain the handler + * for following API calls. * * @retval ::RDC_ST_OK is returned upon successful call. */ @@ -285,8 +325,8 @@ rdc_status_t rdc_connect(const char *ipAndPort, rdc_handle_t* p_rdc_handle); /** * @brief Disconnect from rdcd daemon. * - * @details Disconnect from rdcd daemon, and p_rdc_handle becomes invalid after - * this call. This function is not thread safe. + * @details Disconnect from rdcd daemon, and p_rdc_handle becomes invalid + * after this call. This function is not thread safe. * * @param[in] p_rdc_handle The RDC handler that come from rdc_connect(). * @@ -294,57 +334,20 @@ rdc_status_t rdc_connect(const char *ipAndPort, rdc_handle_t* p_rdc_handle); */ rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle); -/** - * @brief Create a group contains multiple GPUs - * - * @details This method can create a group contains multiple GPUs. Instead of - * executing an operation separately for each GPU, the RDC group enables - * the user to execute same operation on all the GPUs present in the group as a - * single API call. - * - * @param[in] p_rdc_handle The RDC handler. - * - * @param[in] type The type of the group. RDC_GROUP_DEFAULT includes all the GPUs - * on the node, and RDC_GROUP_EMPTY creates an empty group. - * - * @param[in] group_name The group name specified as NULL terminated C String - * - * @param[inout] p_rdc_group_id Caller provided pointer to rdc_gpu_group_t. Upon - * successful call, the value will contain the group id for following group API calls. - * - * @retval ::RDC_ST_OK is returned upon successful call. 
- */ -rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, - rdc_group_type_t type, const char* group_name, - rdc_gpu_group_t* p_rdc_group_id); - -/** - * @brief Add a GPU to the group - * - * @details This method can add a GPU to the group - * - * @param[in] p_rdc_handle The The RDC handler. - * - * @param[in] group_id The group id to which the GPU will be added. - * - * @param[in] gpu_index The GPU index to be added to the group. - * - * @retval ::RDC_ST_OK is returned upon successful call. - */ -rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, uint32_t gpu_index); - /** * @brief Request the RDC to watch the job stats * - * @details The summary job stats can be retrieved using rdc_job_get_stats() + * @details This should be executed as part of job prologue. The summary + * job stats can be retrieved using rdc_job_get_stats(). * In RDC_OPERATION_MODE_MANUAL, user must call rdc_update_all_fields(1) * at least once, before call rdc_job_get_stats() * - * @param[in] p_rdc_handle The The RDC handler. + * @param[in] p_rdc_handle The RDC handler. * * @param[in] group_id The group of GPUs to be watched. * + * @param[in] job_id The name of the job. + * * @param[in] update_freq How often to update this field in usec. * * @param[in] max_keep_age How long to keep data for this field in seconds. @@ -353,30 +356,17 @@ rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, * * @retval ::RDC_ST_OK is returned upon successful call. */ -rdc_status_t rdc_watch_job_fields(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples); - -/** - * @brief Request RDC a job to be started - * - * @details This should be execute as part of job prologue - * - * @param[in] p_rdc_handle The The RDC handler. - * - * @param[in] job_id The name of the job. - * - * @retval ::RDC_ST_OK is returned upon successful call. 
+ - */ rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, char job_id[64]); + rdc_gpu_group_t group_id, char job_id[64], uint64_t update_freq, + double max_keep_age, uint32_t max_keep_samples); /** * @brief Get the stats of the job using the job id. * - * @details The stats can be retrieved at any point when the job is in process. + * @details The stats can be retrieved at any point when the job is in + * process. * - * @param[in] p_rdc_handle The The RDC handler. + * @param[in] p_rdc_handle The RDC handler. * * @param[in] job_id The name of the job. * @@ -391,11 +381,11 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64], /** * @brief Request RDC to stop watching the stats of the job * - * @details This should be execute as part of job epilogue. The job Id remains - * available to view the stats at any point. You must call rdc_watch_job_fields() - * before this call. + * @details This should be executed as part of job epilogue. The job Id + * remains available to view the stats at any point. You must call + * rdc_job_start_stats() before this call. * - * @param[in] p_rdc_handle The The RDC handler. + * @param[in] p_rdc_handle The RDC handler. * * @param[in] job_id The name of the job. * @@ -407,9 +397,10 @@ rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, /** * @brief Request RDC to update all fields to be watched. * - * @details In RDC_OPERATION_MODE_MANUAL, the user must call this method periodically. + * @details In RDC_OPERATION_MODE_MANUAL, the user must call this method + * periodically. * - * @param[in] p_rdc_handle The The RDC handler. + * @param[in] p_rdc_handle The RDC handler. * * @param[in] wait_for_update Whether or not to wait for the update loop to * complete before returning to the caller 1=wait. 0=do not wait. 
@@ -419,15 +410,279 @@ rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, rdc_status_t rdc_update_all_fields(rdc_handle_t p_rdc_handle, uint32_t wait_for_update); +/** + * @brief Get indexes corresponding to all the devices on the system. + * + * @details Indexes represents RDC GPU Id corresponding to each GPU on the + * system and is immutable during the lifespan of the engine. The list + * should be queried again if the engine is restarted. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[out] gpu_index_list Array reference to fill GPU indexes present on + * the system. + * + * @param[out] count Number of GPUs returned in gpu_index_list. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_get_all_devices(rdc_handle_t p_rdc_handle, + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count); + +/** + * @brief Gets device attributes corresponding to the gpu_index. + * + * @details Fetch the attributes, such as device name, of a GPU. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] gpu_index GPU index corresponding to which the attributes + * should be fetched + * + * @param[out] p_rdc_attr GPU attribute corresponding to the gpu_index. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_get_device_attributes(rdc_handle_t p_rdc_handle, + uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr); + +/** + * @brief Create a group contains multiple GPUs + * + * @details This method can create a group contains multiple GPUs. Instead of + * executing an operation separately for each GPU, the RDC group enables + * the user to execute same operation on all the GPUs present in the group as + * a single API call. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] type The type of the group. RDC_GROUP_DEFAULT includes all the + * GPUs on the node, and RDC_GROUP_EMPTY creates an empty group. 
+ * + * @param[in] group_name The group name specified as NULL terminated C String + * + * @param[inout] p_rdc_group_id Caller provided pointer to rdc_gpu_group_t. + * Upon successful call, the value will contain the group id for following + * group API calls. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, + rdc_group_type_t type, const char* group_name, + rdc_gpu_group_t* p_rdc_group_id); + +/** + * @brief Add a GPU to the group + * + * @details This method can add a GPU to the group + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The group id to which the GPU will be added. + * + * @param[in] gpu_index The GPU index to be added to the group. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, uint32_t gpu_index); + +/** + * @brief Get information about a GPU group + * + * @details Get detail information about a GPU group created by + * rdc_group_gpu_create + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] p_rdc_group_id The GPU group handler created by + * rdc_group_gpu_create + * + * @param[out] p_rdc_group_info The information of the GPU + * group p_rdc_group_id. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info); + +/** + * @brief Destroy GPU group represented by p_rdc_group_id + * + * @details Delete the logic group represented by p_rdc_group_id + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] p_rdc_group_id The group id + * + * @retval ::RDC_ST_OK is returned upon successful call. 
+ */ +rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t p_rdc_group_id); + +/** + * @brief create a group of fields + * + * @details The user can create a group of fields and perform an operation + * on a group of fields at once. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] num_field_ids Number of field IDs that are being provided + * in field_ids. + * + * @param[in] field_ids Field IDs to be added to the newly-created + * field group. + * + * @param[in] field_group_name Unique name for this group of fields. + * + * @param[out] rdc_field_group_id Handle to the newly-created field group + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle, + uint32_t num_field_ids, uint32_t* field_ids, + const char* field_group_name, rdc_field_grp_t* rdc_field_group_id); + +/** + * @brief Get information about a field group + * + * @details Get detail information about a field group created by + * rdc_group_field_create + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] rdc_field_group_id The field group handler created by + * rdc_group_field_create + * + * @param[out] field_group_info The information of the field group + * rdc_field_group_id. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, + rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info); + +/** + * @brief Destroy field group represented by rdc_field_group_id + * + * @details Delete the logic group represented by rdc_field_group_id + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] rdc_field_group_id The field group id + * + * @retval ::RDC_ST_OK is returned upon successful call. 
+ */ +rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, + rdc_field_grp_t rdc_field_group_id); + +/** + * @brief Request the RDC start recording updates for a given field + * collection. + * + * @details Note that the first update of the field will not occur + * until the next field update cycle. To force a field update cycle, + * user must call rdc_update_all_fields(1) + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The group of GPUs to be watched. + * + * @param[in] field_group_id The collection of fields to record + * + * @param[in] update_freq How often to update fields in usec. + * + * @param[in] max_keep_age How long to keep data for fields in seconds. + * + * @param[in] max_keep_samples Maximum number of samples to keep. 0=no limit. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_watch_fields(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples); + +/** + * @brief Request a latest cached field of a GPU + * + * @details Note that the field can be cached after called rdc_watch_fields + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] gpu_index The GPU index. + * + * @param[in] field The field id + * + * @param[out] value The field value got from cache. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_get_latest_value_for_field(rdc_handle_t p_rdc_handle, + uint32_t gpu_index, uint32_t field, rdc_field_value* value); + +/** + * @brief Request a history cached field of a GPU + * + * @details Note that the field can be cached after called rdc_watch_fields + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] gpu_index The GPU index. + * + * @param[in] field The field id + * + * @param[in] since_time_stamp Timestamp to request values since in + * usec since 1970. 
+ * + * @param[out] next_since_time_stamp Timestamp to use for since_time_stamp + * on next call to this function + * + * @param[out] value The field value got from cache. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_get_field_value_since(rdc_handle_t p_rdc_handle, + uint32_t gpu_index, uint32_t field, uint64_t since_time_stamp, + uint64_t *next_since_time_stamp, rdc_field_value* value); + +/** + * @brief Stop recording updates for a given field collection. + * + * @details The cache of those fields will not be updated after this call + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id. + * + * @param[in] field_group_id The field group id. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_unwatch_fields(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id); + /** * @brief Get a description of a provided RDC error status * * @details return the string in human readable format. * - * @param[in] status The The RDC status. + * @param[in] status The RDC status. * * @retval The string to describe the RDC status. */ const char* rdc_status_string(rdc_status_t status); +/** + * @brief Get the name of a field + * + * @details return the string in human readable format. + * + * @param[in] field_id The field id. + * + * @retval The string to describe the field. + */ +const char* field_id_string(uint32_t field_id); + #endif // RDC_RDC_H_ diff --git a/projects/rdc/include/rdc_lib/RdcGroupSettings.h b/projects/rdc/include/rdc_lib/RdcGroupSettings.h new file mode 100644 index 0000000000..1c12cb37c3 --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcGroupSettings.h @@ -0,0 +1,62 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+#ifndef RDC_LIB_RDCGROUPSETTINGS_H_
+#define RDC_LIB_RDCGROUPSETTINGS_H_
+
+#include
+#include "rdc_lib/rdc_common.h"
+#include "rdc/rdc.h"
+
+namespace amd {
+namespace rdc {
+
+// Abstract interface for GPU-group and field-group bookkeeping.
+// Every operation is pure virtual and reports its outcome as rdc_status_t.
+class RdcGroupSettings {
+ public:
+    // GPU groups: create, destroy, add one GPU index, read back the info.
+    virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
+        const char* group_name, rdc_gpu_group_t* p_rdc_group_id) = 0;
+    virtual rdc_status_t rdc_group_gpu_destroy(
+        rdc_gpu_group_t p_rdc_group_id) = 0;
+    virtual rdc_status_t rdc_group_gpu_add(
+        rdc_gpu_group_t groupId, uint32_t gpu_index) = 0;
+    virtual rdc_status_t rdc_group_gpu_get_info(
+        rdc_gpu_group_t p_rdc_group_id,
+        rdc_group_info_t* p_rdc_group_info) = 0;
+
+    // Field groups: create from an array of field ids, destroy, read back.
+    virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
+        uint32_t* field_ids, const char* field_group_name,
+        rdc_field_grp_t* rdc_field_group_id) = 0;
+    virtual rdc_status_t rdc_group_field_destroy(
+        rdc_field_grp_t rdc_field_group_id) = 0;
+    virtual rdc_status_t rdc_group_field_get_info(
+        rdc_field_grp_t rdc_field_group_id,
+        rdc_field_group_info_t* field_group_info) = 0;
+
+    // Virtual so implementations can be destroyed through this interface.
+    virtual ~RdcGroupSettings() {}
+};
+
+// Shared-pointer alias named after the interface above.
+typedef std::shared_ptr RdcGroupSettingsPtr;
+
+
+} // namespace rdc
+} // namespace amd
+
+#endif // RDC_LIB_RDCGROUPSETTINGS_H_
diff --git a/projects/rdc/include/rdc_lib/RdcHandler.h b/projects/rdc/include/rdc_lib/RdcHandler.h
index 73a7d90d89..74db05a53f 100644
--- a/projects/rdc/include/rdc_lib/RdcHandler.h
+++ b/projects/rdc/include/rdc_lib/RdcHandler.h
@@ -31,21 +31,54 @@ namespace rdc {
 // Interface class
 class RdcHandler {
  public:
-    virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
-        const char* group_name, rdc_gpu_group_t* p_rdc_group_id) = 0;
-    virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
-        uint32_t gpu_index) = 0;
-
+    // Job API
+    // rdc_job_start_stats now also takes update_freq, max_keep_age and
+    // max_keep_samples, the parameters of the removed rdc_watch_job_fields.
     virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId,
-        char job_id[64]) = 0;
-    virtual rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t groupId,
-        uint64_t update_freq, double max_keep_age,
-        uint32_t max_keep_samples) = 0;
+        char job_id[64], uint64_t update_freq, double max_keep_age,
+        uint32_t max_keep_samples) = 0;
     virtual rdc_status_t rdc_job_get_stats(char jobId[64],
         rdc_job_info_t* p_job_info)= 0;
     virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0;
+
+    // Discovery API
+    // gpu_index_list is a caller-provided array of RDC_MAX_NUM_DEVICES
+    // entries; count is an output parameter.
+    virtual rdc_status_t rdc_get_all_devices(
+        uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) = 0;
+    virtual rdc_status_t rdc_get_device_attributes(uint32_t gpu_index,
+        rdc_device_attributes_t* p_rdc_attr) = 0;
+
+    // Group API
+    // Same signatures as the RdcGroupSettings interface.
+    virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
+        const char* group_name, rdc_gpu_group_t* p_rdc_group_id) = 0;
+    virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
+        uint32_t gpu_index) = 0;
+    virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
+        uint32_t* field_ids, const char* field_group_name,
+        rdc_field_grp_t* rdc_field_group_id) = 0;
+    virtual rdc_status_t rdc_group_field_get_info(
+        rdc_field_grp_t rdc_field_group_id,
+        rdc_field_group_info_t* field_group_info) = 0;
+    virtual rdc_status_t rdc_group_gpu_get_info(
+        rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) = 0;
+    virtual rdc_status_t rdc_group_gpu_destroy(
+        rdc_gpu_group_t p_rdc_group_id) = 0;
+    virtual rdc_status_t rdc_group_field_destroy(
+        rdc_field_grp_t rdc_field_group_id) = 0;
+
+    // Field API
+    // Watch/unwatch take a GPU group plus a field group; the getters take a
+    // single gpu_index and field and fill the caller's rdc_field_value.
+    virtual rdc_status_t rdc_watch_fields(rdc_gpu_group_t group_id,
+        rdc_field_grp_t field_group_id, uint64_t update_freq,
+        double max_keep_age, uint32_t max_keep_samples) = 0;
+    virtual rdc_status_t rdc_get_latest_value_for_field(uint32_t gpu_index,
+        uint32_t field, rdc_field_value* value) = 0;
+    virtual rdc_status_t rdc_get_field_value_since(uint32_t gpu_index,
+        uint32_t field, uint64_t since_time_stamp,
+        uint64_t *next_since_time_stamp, rdc_field_value* value) = 0;
+    virtual rdc_status_t rdc_unwatch_fields(rdc_gpu_group_t group_id,
+        rdc_field_grp_t field_group_id) = 0;
+
+    // Control API
     virtual rdc_status_t rdc_update_all_fields(uint32_t wait_for_update) = 0;
+
     virtual ~RdcHandler(){}
 };
 
diff --git a/projects/rdc/include/rdc_lib/RdcMetricFetcher.h b/projects/rdc/include/rdc_lib/RdcMetricFetcher.h
new file mode 100644
index 0000000000..3f8ad0b493
--- /dev/null
+++ b/projects/rdc/include/rdc_lib/RdcMetricFetcher.h
@@ -0,0 +1,45 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef RDC_LIB_RDCMETRICFETCHER_H_
+#define RDC_LIB_RDCMETRICFETCHER_H_
+
+#include
+#include "rdc_lib/rdc_common.h"
+#include "rdc/rdc.h"
+
+
+namespace amd {
+namespace rdc {
+
+// Abstract interface for reading a single field of a single GPU.
+class RdcMetricFetcher {
+ public:
+    // Reads field_id for the GPU at gpu_index into the caller's *value and
+    // reports the outcome as rdc_status_t.
+    virtual rdc_status_t fetch_smi_field(uint32_t gpu_index,
+        uint32_t field_id, rdc_field_value* value) = 0;
+    virtual ~RdcMetricFetcher() {}
+};
+
+// Shared-pointer alias named after the interface above.
+typedef std::shared_ptr RdcMetricFetcherPtr;
+
+} // namespace rdc
+} // namespace amd
+
+#endif // RDC_LIB_RDCMETRICFETCHER_H_
diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h
index 9abf0f4106..72638f855c 100644
--- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h
+++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h
@@ -23,33 +23,67 @@ THE SOFTWARE.
 #define RDC_LIB_IMPL_RDCEMBEDDEDHANDLER_H_
 
 #include "rdc_lib/RdcHandler.h"
-
+#include "rdc_lib/RdcGroupSettings.h"
+#include "rdc_lib/RdcMetricFetcher.h"
 
 namespace amd {
 namespace rdc {
 
-class RdcEmbeddedHandler: public RdcHandler
-{
+// RdcHandler implementation for embedded mode. Every RdcHandler operation is
+// overridden; the two private members below are the collaborators it holds.
+class RdcEmbeddedHandler: public RdcHandler {
  public:
-    rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
-        const char* group_name,
-        rdc_gpu_group_t* p_rdc_group_id) override;
-    rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
-        uint32_t gpu_index) override;
-
+    // Job API
     rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId,
-        char job_id[64]) override;
-
-    rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t groupId,
-        uint64_t update_freq, double max_keep_age,
+        char job_id[64], uint64_t update_freq, double max_keep_age,
         uint32_t max_keep_samples) override;
     rdc_status_t rdc_job_get_stats(char jobId[64],
         rdc_job_info_t* p_job_info) override;
     rdc_status_t rdc_job_stop_stats(char job_id[64]) override;
 
+    // Discovery API
+    rdc_status_t rdc_get_all_devices(
+        uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) override;
+    rdc_status_t rdc_get_device_attributes(uint32_t gpu_index,
+        rdc_device_attributes_t* p_rdc_attr) override;
+
+    // Group API
+    rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
+        const char* group_name,
+        rdc_gpu_group_t* p_rdc_group_id) override;
+    rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
+        uint32_t gpu_index) override;
+    rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
+        uint32_t* field_ids, const char* field_group_name,
+        rdc_field_grp_t* rdc_field_group_id) override;
+    rdc_status_t rdc_group_field_get_info(
+        rdc_field_grp_t rdc_field_group_id,
+        rdc_field_group_info_t* field_group_info) override;
+    rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
+        rdc_group_info_t* p_rdc_group_info) override;
+    rdc_status_t rdc_group_gpu_destroy(
+        rdc_gpu_group_t p_rdc_group_id) override;
+    rdc_status_t rdc_group_field_destroy(
+        rdc_field_grp_t rdc_field_group_id) override;
+
+    // Field API
+    rdc_status_t rdc_watch_fields(rdc_gpu_group_t group_id,
+        rdc_field_grp_t field_group_id, uint64_t update_freq,
+        double max_keep_age, uint32_t max_keep_samples) override;
+    rdc_status_t rdc_get_latest_value_for_field(uint32_t gpu_index,
+        uint32_t field, rdc_field_value* value) override;
+    rdc_status_t rdc_get_field_value_since(uint32_t gpu_index,
+        uint32_t field, uint64_t since_time_stamp,
+        uint64_t *next_since_time_stamp, rdc_field_value* value) override;
+    rdc_status_t rdc_unwatch_fields(rdc_gpu_group_t group_id,
+        rdc_field_grp_t field_group_id) override;
+
+    // Control API
     rdc_status_t rdc_update_all_fields(uint32_t wait_for_update) override;
 
     explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
+
+ private:
+    // Group bookkeeping; the Group API overrides forward to it.
+    RdcGroupSettingsPtr group_settings_;
+    // Field reader; used by the Discovery API overrides.
+    RdcMetricFetcherPtr metric_fetcher_;
 };
 
 } // namespace rdc
diff --git a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h
new file mode 100644
index 0000000000..5470bb38f8
--- /dev/null
+++ b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h
@@ -0,0 +1,71 @@
+/*
+Copyright (c) 2020 - 
present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+#ifndef RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_
+#define RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_
+
+
+#include
+#include
+#include
+#include
+#include "rdc_lib/RdcGroupSettings.h"
+
+namespace amd {
+namespace rdc {
+
+// RdcGroupSettings implementation that keeps GPU groups and field groups in
+// two std::map members, each guarded by its own mutex.
+class RdcGroupSettingsImpl: public RdcGroupSettings {
+ public:
+    rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
+        const char* group_name, rdc_gpu_group_t* p_rdc_group_id) override;
+    rdc_status_t rdc_group_gpu_destroy(
+        rdc_gpu_group_t p_rdc_group_id) override;
+    rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
+        uint32_t gpu_index) override;
+    rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
+        rdc_group_info_t* p_rdc_group_info) override;
+
+    rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
+        uint32_t* field_ids, const char* field_group_name,
+        rdc_field_grp_t* rdc_field_group_id) override;
+    rdc_status_t rdc_group_field_destroy(
+        rdc_field_grp_t rdc_field_group_id) override;
+    rdc_status_t rdc_group_field_get_info(
+        rdc_field_grp_t rdc_field_group_id,
+        rdc_field_group_info_t* field_group_info) override;
+
+    RdcGroupSettingsImpl();
+
+ private:
+    // GPU groups keyed by group id; accessed under group_mutex_.
+    std::map gpu_group_;
+    // Field groups keyed by field-group id; accessed under field_group_mutex_.
+    std::map field_group_;
+    // Id handed to the next GPU group; incremented after each create.
+    uint32_t cur_group_id_ = 0;
+    // Id handed to the next field group ("filed" is a typo for "field"; the
+    // name is kept because RdcGroupSettingsImpl.cc refers to it).
+    uint32_t cur_filed_group_id_ = 0;
+    std::mutex group_mutex_;
+    std::mutex field_group_mutex_;
+
+};
+
+} // namespace rdc
+} // namespace amd
+
+
+#endif // RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_
diff --git a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h
new file mode 100644
index 0000000000..d82c06f9b9
--- /dev/null
+++ b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h
@@ -0,0 +1,39 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
+#define RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
+
+#include "rdc_lib/RdcMetricFetcher.h"
+
+namespace amd {
+namespace rdc {
+
+// Concrete RdcMetricFetcher; overrides the interface's single operation.
+class RdcMetricFetcherImpl: public RdcMetricFetcher {
+ public:
+    rdc_status_t fetch_smi_field(uint32_t gpu_index,
+        uint32_t field_id, rdc_field_value* value) override;
+};
+
+} // namespace rdc
+} // namespace amd
+
+#endif // RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
diff --git a/projects/rdc/include/rdc_lib/rdc_common.h b/projects/rdc/include/rdc_lib/rdc_common.h
index 9960e536da..ca7b905dc8 100644
--- a/projects/rdc/include/rdc_lib/rdc_common.h
+++ b/projects/rdc/include/rdc_lib/rdc_common.h
@@ -31,5 +31,21 @@ THE SOFTWARE.
 #define LOG_DEBUG(message)
 #endif
 
+/**
+ * @brief strncpy variant that always null-terminates the destination
+ *
+ * @details Copies at most n-1 bytes from src to dest, then writes a null
+ * terminator at dest[n-1]. When n is 0, dest is returned untouched.
+ *
+ * @param[out] dest The destination buffer; must hold at least n bytes
+ *
+ * @param[in] src The source string to be copied
+ *
+ * @param[in] n Size of dest in bytes; at most n-1 bytes will be copied
+ *
+ * @retval Return a pointer to the destination string.
+ */
+char *strncpy_with_null(char *dest, const char *src, size_t n);
+
 #endif // RDC_LIB_RDC_COMMON_H_
diff --git a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt
index cf23fb8068..ce16dcc9fa 100755
--- a/projects/rdc/rdc_libs/CMakeLists.txt
+++ b/projects/rdc/rdc_libs/CMakeLists.txt
@@ -137,8 +137,15 @@ set_property(TARGET ${BOOTSTRAP_LIB} PROPERTY
 # librdc.so set up
 set(RDC_LIB "rdc")
 set(RDC_LIB_COMPONENT "lib${RDC_LIB}")
-set(RDC_LIB_SRC_LIST "${SRC_DIR}/rdc/src/RdcEmbeddedHandler.cc")
-set(RDC_LIB_INC_LIST "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h")
+set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcEmbeddedHandler.cc")
+set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcMetricFetcherImpl.cc")
+set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcGroupSettingsImpl.cc")
+
+set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h")
+set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcMetricFetcher.h")
+set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcMetricFetcherImpl.h")
+set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcGroupSettings.h")
+set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcGroupSettingsImpl.h")
 message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
 
 link_directories(${RSMI_LIB_DIR})
diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc
index 6f0afa3025..1cbb97d21a 100644
--- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc
+++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
OTHER DEALINGS IN
 THE SOFTWARE.
 */
 #include
+#include
+#include
 #include "rdc/rdc.h"
 #include "rdc_lib/RdcHandler.h"
 #include "rdc_lib/rdc_common.h"
@@ -117,19 +119,6 @@ rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle) {
     return RDC_ST_OK;
 }
 
-
-rdc_status_t rdc_watch_job_fields(rdc_handle_t p_rdc_handle,
-    rdc_gpu_group_t group_id, uint64_t update_freq, double max_keep_age,
-    uint32_t max_keep_samples) {
-    if (!p_rdc_handle) {
-        return RDC_ST_INVALID_HANDLER;
-    }
-
-    return static_cast(p_rdc_handle)->
-        rdc_watch_job_fields(group_id, update_freq,
-            max_keep_age, max_keep_samples);
-}
-
 rdc_status_t rdc_update_all_fields(rdc_handle_t p_rdc_handle,
     uint32_t wait_for_update) {
     if (!p_rdc_handle) {
@@ -151,13 +140,15 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64] ,
 }
 
 rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle,
-    rdc_gpu_group_t groupId, char job_id[64] ) {
+    rdc_gpu_group_t groupId, char job_id[64], uint64_t update_freq,
+    double max_keep_age, uint32_t max_keep_samples ) {
     if (!p_rdc_handle) {
         return RDC_ST_INVALID_HANDLER;
     }
 
     return static_cast(p_rdc_handle)->
-        rdc_job_start_stats(groupId, job_id);
+        rdc_job_start_stats(groupId, job_id, update_freq,
+            max_keep_age, max_keep_samples);
 }
 
 
@@ -191,6 +182,124 @@ rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle,
         rdc_group_gpu_add(groupId, gpuIndex);
 }
 
+// The C entry points below all follow one pattern: reject a null handle (and,
+// where checked, null output pointers) with RDC_ST_INVALID_HANDLER, otherwise
+// forward the call to the handler object behind p_rdc_handle.
+rdc_status_t rdc_get_all_devices(rdc_handle_t p_rdc_handle,
+    uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) {
+    if (!p_rdc_handle || !count) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_get_all_devices(gpu_index_list, count);
+}
+
+rdc_status_t rdc_get_device_attributes(rdc_handle_t p_rdc_handle,
+    uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr) {
+    if (!p_rdc_handle || !p_rdc_attr) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_get_device_attributes(gpu_index, p_rdc_attr);
+}
+
+rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle,
+    uint32_t num_field_ids, uint32_t* field_ids,
+    const char* field_group_name, rdc_field_grp_t* rdc_field_group_id) {
+    if (!p_rdc_handle || !field_ids ||
+        !field_group_name || !rdc_field_group_id) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_group_field_create(num_field_ids, field_ids,
+            field_group_name, rdc_field_group_id);
+}
+
+rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle,
+    rdc_field_grp_t rdc_field_group_id,
+    rdc_field_group_info_t* field_group_info) {
+    if (!p_rdc_handle) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_group_field_get_info(rdc_field_group_id, field_group_info);
+}
+
+rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle,
+    rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) {
+    if (!p_rdc_handle || !p_rdc_group_info) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_group_gpu_get_info(p_rdc_group_id, p_rdc_group_info);
+}
+
+rdc_status_t rdc_watch_fields(rdc_handle_t p_rdc_handle,
+    rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id,
+    uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples) {
+    if (!p_rdc_handle) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_watch_fields(group_id, field_group_id, update_freq,
+            max_keep_age, max_keep_samples);
+}
+
+rdc_status_t rdc_get_latest_value_for_field(rdc_handle_t p_rdc_handle,
+    uint32_t gpu_index, uint32_t field, rdc_field_value* value) {
+    if (!p_rdc_handle || !value) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_get_latest_value_for_field(gpu_index, field, value);
+}
+
+rdc_status_t rdc_get_field_value_since(rdc_handle_t p_rdc_handle,
+    uint32_t gpu_index, uint32_t field, uint64_t since_time_stamp,
+    uint64_t *next_since_time_stamp, rdc_field_value* value) {
+    if (!p_rdc_handle || !next_since_time_stamp || !value) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_get_field_value_since(gpu_index, field, since_time_stamp,
+            next_since_time_stamp, value);
+}
+
+rdc_status_t rdc_unwatch_fields(rdc_handle_t p_rdc_handle,
+    rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) {
+    if (!p_rdc_handle) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_unwatch_fields(group_id, field_group_id);
+}
+
+rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle,
+    rdc_gpu_group_t p_rdc_group_id) {
+    if (!p_rdc_handle) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_group_gpu_destroy(p_rdc_group_id);
+}
+
+rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle,
+    rdc_field_grp_t rdc_field_group_id) {
+    if (!p_rdc_handle) {
+        return RDC_ST_INVALID_HANDLER;
+    }
+
+    return static_cast(p_rdc_handle)->
+        rdc_group_field_destroy(rdc_field_group_id);
+}
+
 const char* rdc_status_string(rdc_status_t result) {
     switch (result) {
         case RDC_ST_OK:
@@ -201,8 +310,44 @@ const char* rdc_status_string(rdc_status_t result) {
             return "Fail to load module";
         case RDC_ST_INVALID_HANDLER:
             return "Invalid handler";
+        case RDC_ST_NOT_FOUND:
+            return "Cannot find the value";
+        case RDC_ST_BAD_PARAMETER:
+            return "Invalid parameters";
+        case RDC_ST_MSI_ERROR:
+            return "SMI error";
+        case RDC_ST_MAX_LIMIT:
+            return "The max limit reached";
         default:
             return "Unknown";
     }
 }
 
+// Maps a field id to its printable name; ids missing from the table yield
+// "UNKNOWN_FIELD".
+const char* field_id_string(uint32_t field_id) {
+    const std::map id_name = {
+        {RDC_FI_GPU_MEMORY_USAGE, "GPU_MEMORY_USAGE"},
+        {RDC_FI_GPU_MEMORY_TOTAL, "GPU_MEMORY_TOTAL"},
+        {RDC_FI_POWER_USAGE, "POWER_USAGE"},
+        {RDC_FI_GPU_SM_CLOCK, "GPU_SM_CLOCK"},
+        {RDC_FI_GPU_UTIL, "GPU_UTIL"},
+        {RDC_FI_GPU_TEMP, "GPU_TEMP"},
+        {RDC_FI_GPU_COUNT, "GPU_COUNT"},
+        {RDC_FI_DEV_NAME, "DEV_NAME"}
+    };
+
+    auto search = id_name.find(field_id);
+    if (search == id_name.end()) {
+        return "UNKNOWN_FIELD";
+    }
+
+    return search->second;
+}
+
+// Copies at most n-1 bytes of src into dest and always writes a terminator at
+// dest[n-1]; with n == 0 nothing is written.
+char *strncpy_with_null(char *dest, const char *src, size_t n) {
+    if (n == 0) {
+        return dest;
+    }
+    strncpy(dest, src, n - 1);
+    dest[n - 1]= '\0';
+    return dest;
+}
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc
index 7850b35f00..7e6cff9248 100644
--- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc
+++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc
@@ -20,6 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 #include "rdc_lib/impl/RdcEmbeddedHandler.h"
+#include
+#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
+#include "rdc_lib/impl/RdcGroupSettingsImpl.h"
 #include "rdc_lib/rdc_common.h"
 #include "rocm_smi/rocm_smi.h"
 
@@ -47,51 +50,188 @@ namespace amd {
 namespace rdc {
 
-RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t) {
-    // TODO(next_step): implement
+// Creates the group-settings and metric-fetcher collaborators that the rest
+// of the handler delegates to.
+RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode) :
+    group_settings_(new RdcGroupSettingsImpl())
+    , metric_fetcher_(new RdcMetricFetcherImpl()) {
+    // TODO(bill_liu): implement the operation mode
+    (void)(mode);
 }
 
-rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_create(rdc_group_type_t type,
-    const char* group_name, rdc_gpu_group_t* p_rdc_group_id) {
-    // TODO(next_step): implement
-    return RDC_ST_OK;
-}
+// JOB API
+rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId,
+    char job_id[64], uint64_t update_freq, double max_keep_age,
+    uint32_t max_keep_samples) {
+    // TODO(bill_liu): implement
+    (void)(groupId);
+    (void)(job_id);
+    (void)(update_freq);
+    (void)(max_keep_age);
+    (void)(max_keep_samples);
 
-rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id,
-    uint32_t gpu_index ) {
-    // TODO(next_step): implement
-    return RDC_ST_OK;
-}
-
-
-rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(
-    rdc_gpu_group_t groupId, char job_id[64]) {
-    // TODO(next_step): implement
-    return RDC_ST_OK;
-}
-
-rdc_status_t RdcEmbeddedHandler::rdc_watch_job_fields(rdc_gpu_group_t groupId,
-    uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples) {
-    // TODO(next_step): implement
     return RDC_ST_OK;
 }
 
 rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64],
     rdc_job_info_t* p_job_info) {
-    // TODO(next_step): implement
-    LOG_DEBUG("RdcEmbeddedHandler::rdc_job_get_stats:" << job_id);
+    // TODO(bill_liu): implement
+    (void)(job_id);
+    (void)(p_job_info);
     return RDC_ST_OK;
 }
 
 rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64] ) {
-    // TODO(next_step): implement
+    // TODO(bill_liu): implement
+    (void)(job_id);
     return RDC_ST_OK;
 }
 
+
+// Discovery API
+// Fills gpu_index_list with 0..N-1 and stores N in *count, where N is the
+// value fetched for RDC_FI_GPU_COUNT.
+// Returns RDC_ST_BAD_PARAMETER when either pointer is null, the fetcher's
+// status when the fetch fails, and RDC_ST_MAX_LIMIT when N does not fit in
+// the RDC_MAX_NUM_DEVICES-sized list. Nothing is written on failure.
+rdc_status_t RdcEmbeddedHandler::rdc_get_all_devices(
+    uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) {
+    if (!gpu_index_list || !count) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+    rdc_field_value device_count;
+    rdc_status_t status = metric_fetcher_->
+        fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count);
+    if (status != RDC_ST_OK) {
+        return status;
+    }
+
+    // Refuse a count larger than the caller's array instead of writing past
+    // its end. If l_int is signed, a negative value converts to a huge
+    // unsigned number and is rejected by the same test.
+    const uint64_t reported = static_cast<uint64_t>(device_count.value.l_int);
+    if (reported > RDC_MAX_NUM_DEVICES) {
+        return RDC_ST_MAX_LIMIT;
+    }
+
+    // Assign the index to the index list
+    *count = static_cast<uint32_t>(reported);
+    for (uint32_t i = 0; i < *count; i++) {
+        gpu_index_list[i] = i;
+    }
+
+    return RDC_ST_OK;
+}
+
+// Copies the value fetched for RDC_FI_DEV_NAME into p_rdc_attr->device_name.
+// Returns RDC_ST_BAD_PARAMETER for a null p_rdc_attr; when the fetch fails its
+// status is returned and *p_rdc_attr is left untouched.
+rdc_status_t RdcEmbeddedHandler::rdc_get_device_attributes(uint32_t gpu_index,
+    rdc_device_attributes_t* p_rdc_attr) {
+    if (!p_rdc_attr) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+    rdc_field_value device_name;
+    rdc_status_t status = metric_fetcher_->
+        fetch_smi_field(gpu_index, RDC_FI_DEV_NAME, &device_name);
+    // device_name is a local that is only meaningful after a successful
+    // fetch, so do not copy from it on failure.
+    if (status != RDC_ST_OK) {
+        return status;
+    }
+    strncpy_with_null(p_rdc_attr->device_name, device_name.value.str,
+        RDC_MAX_STR_LENGTH);
+    return RDC_ST_OK;
+}
+
+
+ // Group API
+// Each Group API method validates its pointer arguments (returning
+// RDC_ST_BAD_PARAMETER for a null one) and then forwards to group_settings_.
+rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_create(rdc_group_type_t type,
+    const char* group_name,
+    rdc_gpu_group_t* p_rdc_group_id) {
+    if (!group_name || !p_rdc_group_id) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+    return group_settings_->
+        rdc_group_gpu_create(type, group_name, p_rdc_group_id);
+}
+
+rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id,
+    uint32_t gpu_index) {
+    return group_settings_->rdc_group_gpu_add(group_id, gpu_index);
+}
+
+rdc_status_t RdcEmbeddedHandler::rdc_group_field_create(uint32_t num_field_ids,
+    uint32_t* field_ids, const char* field_group_name,
+    rdc_field_grp_t* rdc_field_group_id) {
+    if (!field_group_name || !rdc_field_group_id || !field_ids) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+    return group_settings_->rdc_group_field_create(
+        num_field_ids, field_ids, field_group_name, rdc_field_group_id);
+}
+
+rdc_status_t RdcEmbeddedHandler::rdc_group_field_get_info(
+    rdc_field_grp_t rdc_field_group_id,
+    rdc_field_group_info_t* field_group_info) {
+    if (!field_group_info) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+    return group_settings_->rdc_group_field_get_info(
+        rdc_field_group_id, field_group_info);
+}
+
+rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_get_info(
+    rdc_gpu_group_t p_rdc_group_id,
+    rdc_group_info_t* p_rdc_group_info) {
+    if (!p_rdc_group_info) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+
+    return group_settings_->rdc_group_gpu_get_info(
+        p_rdc_group_id, p_rdc_group_info);
+}
+
+rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_destroy(
+    rdc_gpu_group_t p_rdc_group_id) {
+    return group_settings_->rdc_group_gpu_destroy(p_rdc_group_id);
+}
+
+rdc_status_t RdcEmbeddedHandler::rdc_group_field_destroy(
+    rdc_field_grp_t rdc_field_group_id) {
+    return group_settings_->rdc_group_field_destroy(rdc_field_group_id);
+}
+
+// Field API
+// Not implemented yet: watch/unwatch report RDC_ST_OK without doing anything,
+// and the two getters report RDC_ST_NOT_FOUND after the null checks.
+rdc_status_t RdcEmbeddedHandler::rdc_watch_fields(rdc_gpu_group_t group_id,
+    rdc_field_grp_t field_group_id, uint64_t update_freq,
+    double max_keep_age, uint32_t max_keep_samples) {
+    // TODO(bill_liu): implement
+    (void)(group_id);
+    (void)(field_group_id);
+    (void)(update_freq);
+    (void)(max_keep_age);
+    (void)(max_keep_samples);
+    return RDC_ST_OK;
+}
+
+rdc_status_t RdcEmbeddedHandler::rdc_get_latest_value_for_field(
+    uint32_t gpu_index, uint32_t field, rdc_field_value* value) {
+    // TODO(bill_liu): implement
+    if (!value) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+    (void)(gpu_index);
+    (void)(field);
+    return RDC_ST_NOT_FOUND;
+}
+
+rdc_status_t RdcEmbeddedHandler::rdc_get_field_value_since(uint32_t gpu_index,
+    uint32_t field, uint64_t since_time_stamp,
+    uint64_t *next_since_time_stamp, rdc_field_value* value) {
+    // TODO(bill_liu): implement
+    if (!next_since_time_stamp || !value) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+    (void)(since_time_stamp);
+    (void)(gpu_index);
+    (void)(field);
+    (void)(value);
+
+    return RDC_ST_NOT_FOUND;
+}
+
+rdc_status_t RdcEmbeddedHandler::rdc_unwatch_fields(rdc_gpu_group_t group_id,
+    rdc_field_grp_t field_group_id) {
+    // TODO(bill_liu): implement
+    (void)(group_id);
+    (void)(field_group_id);
+    return RDC_ST_OK;
+}
+
+
+// Control API
 rdc_status_t RdcEmbeddedHandler::rdc_update_all_fields(
     uint32_t wait_for_update) {
-    // TODO(next_step): implement
-    LOG_DEBUG("RdcEmbeddedHandler::rdc_update_all_fields");
+    // TODO(bill_liu): implement
+    (void)(wait_for_update);
     return RDC_ST_OK;
 }
 
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc
new file mode 100644
index 0000000000..f95960a528
--- /dev/null
+++ b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc
@@ -0,0 +1,152 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "rdc_lib/impl/RdcGroupSettingsImpl.h"
+#include
+#include "rdc_lib/rdc_common.h"
+
+namespace amd {
+namespace rdc {
+
+RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
+}
+
+// Creates an empty GPU group named group_name and stores its id in
+// *p_rdc_group_id. RDC_GROUP_DEFAULT is rejected with RDC_ST_NOT_SUPPORTED.
+// Neither pointer is checked here; RdcEmbeddedHandler validates them first.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create(rdc_group_type_t type,
+    const char* group_name, rdc_gpu_group_t* p_rdc_group_id)
+{
+    // TODO(bill_liu): handle type to create default group for all GPUs
+    if (type == RDC_GROUP_DEFAULT) {
+        return RDC_ST_NOT_SUPPORTED;
+    }
+
+    rdc_group_info_t ginfo;
+    // A new group has no members. rdc_group_gpu_add() and
+    // rdc_group_gpu_get_info() read count, so it must not be left
+    // uninitialized.
+    ginfo.count = 0;
+    strncpy_with_null(ginfo.group_name, group_name, RDC_MAX_STR_LENGTH);
+
+    std::lock_guard guard(group_mutex_);
+    gpu_group_.emplace(cur_group_id_, ginfo);
+    *p_rdc_group_id = cur_group_id_;
+    cur_group_id_++;
+
+    return RDC_ST_OK;
+}
+
+
+// Removes the group if present; always reports RDC_ST_OK.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_destroy(
+    rdc_gpu_group_t p_rdc_group_id) {
+    std::lock_guard guard(group_mutex_);
+    gpu_group_.erase(p_rdc_group_id);
+    return RDC_ST_OK;
+}
+
+// Appends gpu_index to the group.
+// Returns RDC_ST_NOT_FOUND for an unknown group, RDC_ST_BAD_PARAMETER when the
+// index is already a member, and RDC_ST_MAX_LIMIT when the group already holds
+// RDC_GROUP_MAX_ENTITIES entries.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add(
+    rdc_gpu_group_t groupId, uint32_t gpu_index ) {
+    std::lock_guard guard(group_mutex_);
+    auto ite = gpu_group_.find(groupId);
+    if (ite == gpu_group_.end()) {
+        // Report the failure instead of claiming success when nothing was
+        // added; this matches rdc_group_gpu_get_info().
+        return RDC_ST_NOT_FOUND;
+    }
+
+    auto& group = ite->second;
+    // Check whether the index already exists
+    for (uint32_t i = 0; i < group.count; i++) {
+        if (group.entity_ids[i] == gpu_index) {
+            return RDC_ST_BAD_PARAMETER;
+        }
+    }
+    if (group.count >= RDC_GROUP_MAX_ENTITIES) {
+        return RDC_ST_MAX_LIMIT;
+    }
+    group.entity_ids[group.count] = gpu_index;
+    group.count++;
+
+    return RDC_ST_OK;
+}
+
+// Copies the stored name, member count and member ids into *p_rdc_group_info,
+// or returns RDC_ST_NOT_FOUND for an unknown group.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_get_info(
+    rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) {
+    std::lock_guard guard(group_mutex_);
+    auto ite = gpu_group_.find(p_rdc_group_id);
+    if (ite == gpu_group_.end()) {
+        return RDC_ST_NOT_FOUND;
+    }
+
+    // Bind by reference: the entry is only read while the lock is held, so
+    // there is no need to copy the whole struct first.
+    const auto& info = ite->second;
+    strncpy_with_null(p_rdc_group_info->group_name,
+        info.group_name, RDC_MAX_STR_LENGTH);
+    p_rdc_group_info->count = info.count;
+    for (uint32_t i = 0; i < info.count; i++) {
+        p_rdc_group_info->entity_ids[i] = info.entity_ids[i];
+    }
+
+    return RDC_ST_OK;
+}
+
+// Creates a field group holding the first num_field_ids entries of field_ids
+// and stores its id in *rdc_field_group_id. Returns RDC_ST_MAX_LIMIT when
+// num_field_ids exceeds RDC_MAX_FIELD_IDS_PER_FIELD_GROUP.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_field_create(
+    uint32_t num_field_ids, uint32_t* field_ids,
+    const char* field_group_name, rdc_field_grp_t* rdc_field_group_id) {
+
+    // Validate before building the entry so an oversized request does no work.
+    if (num_field_ids > RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) {
+        return RDC_ST_MAX_LIMIT;
+    }
+
+    rdc_field_group_info_t finfo;
+    finfo.count = num_field_ids;
+    strncpy_with_null(finfo.group_name, field_group_name, RDC_MAX_STR_LENGTH);
+    for (uint32_t i = 0; i < num_field_ids; i++) {
+        finfo.field_ids[i] = field_ids[i];
+    }
+
+    std::lock_guard guard(field_group_mutex_);
+    field_group_.emplace(cur_filed_group_id_, finfo);
+    *rdc_field_group_id = cur_filed_group_id_;
+    cur_filed_group_id_++;
+
+    return RDC_ST_OK;
+}
+
+// Removes the field group if present; always reports RDC_ST_OK.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_field_destroy(
+    rdc_field_grp_t rdc_field_group_id) {
+    std::lock_guard guard(field_group_mutex_);
+    field_group_.erase(rdc_field_group_id);
+    return RDC_ST_OK;
+}
+
+// Copies the stored name, field count and field ids into *field_group_info,
+// or returns RDC_ST_NOT_FOUND for an unknown field group.
+rdc_status_t RdcGroupSettingsImpl::rdc_group_field_get_info(
+    rdc_field_grp_t rdc_field_group_id,
+    rdc_field_group_info_t* field_group_info) {
+
+    std::lock_guard guard(field_group_mutex_);
+    auto ite = field_group_.find(rdc_field_group_id);
+    if (ite == field_group_.end()) {
+        return RDC_ST_NOT_FOUND;
+    }
+
+    // Read the entry in place while the lock is held; no copy is needed.
+    const auto& info = ite->second;
+    strncpy_with_null(field_group_info->group_name, info.group_name,
+        RDC_MAX_STR_LENGTH);
+    field_group_info->count = info.count;
+    for (uint32_t i = 0; i < info.count; i++) {
+        field_group_info->field_ids[i] = info.field_ids[i];
+    }
+    return RDC_ST_OK;
+}
+
+
+} // namespace rdc
+} // namespace amd
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
new file mode 100644
index 0000000000..aa24bf43b1
--- /dev/null
+++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
@@ -0,0 +1,110 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
+// NOTE(review): the names of the four system headers included at this point
+// were missing from the copy under review. <sys/time.h> is required by
+// gettimeofday() below and <cstdint> by the fixed-width integer types;
+// restore the remaining header names from the original source.
+#include <sys/time.h>
+#include <cstdint>
+#include "rdc_lib/rdc_common.h"
+#include "rocm_smi/rocm_smi.h"
+
+namespace amd {
+namespace rdc {
+
+// Reads a single metric of GPU gpu_index through ROCm SMI into *value.
+//
+// On every call with a non-null value:
+//   - value->ts is set to the current wall-clock time in milliseconds,
+//   - value->field_id is set to field_id,
+//   - value->status receives the status returned by the rsmi call, or stays
+//     RSMI_STATUS_NOT_SUPPORTED for a field_id the switch does not handle,
+//   - value->type and value->value are filled according to the field; the
+//     integer payload is only written when the rsmi call succeeded.
+//
+// Returns RDC_ST_BAD_PARAMETER when value is null, RDC_ST_OK when
+// value->status is RSMI_STATUS_SUCCESS and RDC_ST_MSI_ERROR otherwise.
+rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
+            uint32_t field_id, rdc_field_value* value) {
+    if (value == nullptr) {
+        return RDC_ST_BAD_PARAMETER;
+    }
+
+    // Cast to the destination member types so that no width or signedness
+    // has to be assumed here.
+    using ts_t = decltype(value->ts);
+    using int_value_t = decltype(value->value.l_int);
+
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    // Widen the seconds before multiplying so the product cannot overflow a
+    // narrower time_t.
+    value->ts = static_cast<ts_t>(tv.tv_sec) * 1000 +
+                static_cast<ts_t>(tv.tv_usec / 1000);
+    value->field_id = field_id;
+    value->status = RSMI_STATUS_NOT_SUPPORTED;
+
+    // Marks the value as INTEGER and stores raw only if the preceding rsmi
+    // call reported success.
+    auto store_integer = [value](uint64_t raw) {
+        value->type = INTEGER;
+        if (value->status == RSMI_STATUS_SUCCESS) {
+            value->value.l_int = static_cast<int_value_t>(raw);
+        }
+    };
+
+    switch (field_id) {
+        case RDC_FI_GPU_MEMORY_USAGE: {
+            uint64_t used = 0;
+            value->status = rsmi_dev_memory_usage_get(gpu_index,
+                    RSMI_MEM_TYPE_VRAM, &used);
+            store_integer(used);
+            break;
+        }
+        case RDC_FI_GPU_MEMORY_TOTAL: {
+            uint64_t total = 0;
+            value->status = rsmi_dev_memory_total_get(gpu_index,
+                    RSMI_MEM_TYPE_VRAM, &total);
+            store_integer(total);
+            break;
+        }
+        case RDC_FI_GPU_COUNT: {
+            uint32_t num_gpu = 0;
+            value->status = rsmi_num_monitor_devices(&num_gpu);
+            store_integer(num_gpu);
+            break;
+        }
+        case RDC_FI_POWER_USAGE: {
+            // The second argument is a sensor index, not a temperature
+            // metric; the original passed RSMI_TEMP_CURRENT here.
+            // NOTE(review): assumes RSMI_TEMP_CURRENT == 0, so that sensor 0
+            // is what was queried before - confirm against rocm_smi.h.
+            const uint32_t power_sensor_index = 0;
+            uint64_t power = 0;
+            value->status = rsmi_dev_power_ave_get(gpu_index,
+                    power_sensor_index, &power);
+            store_integer(power);
+            break;
+        }
+        case RDC_FI_GPU_SM_CLOCK: {
+            rsmi_frequencies_t freqs{};
+            value->status = rsmi_dev_gpu_clk_freq_get(gpu_index,
+                    RSMI_CLK_TYPE_SYS, &freqs);
+            value->type = INTEGER;
+            // Index the table only after a successful call.
+            // NOTE(review): freqs.current is trusted to be a valid index
+            // into freqs.frequency - confirm that rsmi guarantees this.
+            if (value->status == RSMI_STATUS_SUCCESS) {
+                value->value.l_int = static_cast<int_value_t>(
+                        freqs.frequency[freqs.current]);
+            }
+            break;
+        }
+        case RDC_FI_GPU_UTIL: {
+            uint32_t busy_percent = 0;
+            value->status = rsmi_dev_busy_percent_get(gpu_index,
+                    &busy_percent);
+            store_integer(busy_percent);
+            break;
+        }
+        case RDC_FI_DEV_NAME: {
+            // The name is written straight into the value's string buffer.
+            value->status = rsmi_dev_name_get(gpu_index,
+                    value->value.str, RDC_MAX_STR_LENGTH);
+            value->type = STRING;
+            break;
+        }
+        default:
+            // Unknown field: status remains RSMI_STATUS_NOT_SUPPORTED.
+            break;
+    }
+
+    return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
+}
+
+
+}  // namespace rdc
+}  // namespace amd