diff --git a/projects/rdc/.clang-format b/projects/rdc/.clang-format new file mode 100644 index 0000000000..f506d11e61 --- /dev/null +++ b/projects/rdc/.clang-format @@ -0,0 +1,9 @@ +--- +Language: Cpp +BasedOnStyle: Google +ColumnLimit: 100 + +# Force pointers to the type for C++. +# For some reason Google style doesn't specify this.. +DerivePointerAlignment: false +PointerAlignment: Left diff --git a/projects/rdc/.editorconfig b/projects/rdc/.editorconfig index ad8d825fc2..e1c63b8bb7 100644 --- a/projects/rdc/.editorconfig +++ b/projects/rdc/.editorconfig @@ -13,7 +13,7 @@ end_of_line = lf [*.{c,cc,cpp,h,hh,hpp}] charset = utf-8 indent_style = space -indent_size = 4 +indent_size = 2 [*.py] indent_style = space diff --git a/projects/rdc/.gitignore b/projects/rdc/.gitignore index d693107b87..10db303ca8 100644 --- a/projects/rdc/.gitignore +++ b/projects/rdc/.gitignore @@ -17,3 +17,7 @@ docs/_doxygen/ # VisualStudioCode .vscode/ + +# do NOT ignore these files +!.clang-format +!.editorconfig diff --git a/projects/rdc/.pre-commit-config.yaml b/projects/rdc/.pre-commit-config.yaml new file mode 100644 index 0000000000..d84939508a --- /dev/null +++ b/projects/rdc/.pre-commit-config.yaml @@ -0,0 +1,30 @@ +# - How to use: +# python3 -m pip install pre-commit +# pre-commit install --install-hooks +# Upon a new commit - the hooks should automagically run +# +# - How to skip: +# git commit --no-verify +# or +# SKIP=clang-format-docker git commit +# SKIP=cpplint-docker git commit + +fail_fast: false +repos: + # For portability I decided to use Docker containers + - repo: https://github.com/dmitrii-galantsev/pre-commit-docker-cpplint + rev: 0.0.3 + hooks: + - id: clang-format-docker + - id: cpplint-docker + # Below is a local way of running formatters and linters + # NOTE: clang-tidy is not used in the above tests + # - repo: https://github.com/pocc/pre-commit-hooks + # rev: v1.3.5 + # hooks: + # - id: clang-format + # args: [--no-diff, -i] + # - id: clang-tidy + # args: 
[-p=build, --quiet] + # - id: cpplint + # args: [--verbose=5] diff --git a/projects/rdc/CPPLINT.cfg b/projects/rdc/CPPLINT.cfg new file mode 100644 index 0000000000..b63692c6df --- /dev/null +++ b/projects/rdc/CPPLINT.cfg @@ -0,0 +1,3 @@ +set noparent +linelength=100 +filter=-build/include_subdir,-legal/copyright,-runtime/printf,-build/c++11,-runtime/int,-build/header_guard diff --git a/projects/rdc/client/include/rdc/rdc_client.h b/projects/rdc/client/include/rdc/rdc_client.h old mode 100755 new mode 100644 index 3c37d7f984..8813f2c43f --- a/projects/rdc/client/include/rdc/rdc_client.h +++ b/projects/rdc/client/include/rdc/rdc_client.h @@ -28,6 +28,7 @@ THE SOFTWARE. #include #include + #include "rocm_smi/rocm_smi.h" /** @@ -190,7 +191,6 @@ typedef enum { RDC_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred } rdc_status_t; - /** * @brief Handle to RDC server channel */ @@ -232,10 +232,8 @@ typedef uintptr_t rdc_channel_t; * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. * */ -rdc_status_t -rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect, - grpc_connectivity_state *state); - +rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect, + grpc_connectivity_state* state); /** * @brief Verify a channel's connection to the server @@ -252,8 +250,7 @@ rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect, * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. 
* */ -rdc_status_t -rdc_channel_connection_verify(rdc_channel_t channel); +rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel); /** @} */ // end of RDCAdmin @@ -267,7 +264,7 @@ rdc_channel_connection_verify(rdc_channel_t channel); /** * @brief Create a communications channel to an RDC server * - * @details Given a pointer to an ::rdc_channel_t @p channel, a string + * @details Given a pointer to an ::rdc_channel_t @p channel, a string * containing the ip address of the server @p ip, a string containing * the port number on which the server is listening @p port and a bool * indicating whether the channel should use a secure link @p secure, @@ -290,9 +287,8 @@ rdc_channel_connection_verify(rdc_channel_t channel); * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. * */ -rdc_status_t -rdc_channel_create(rdc_channel_t *channel, const char *ip, const char *port, - bool secure); +rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port, + bool secure); /** * @brief Destroy a communications channel to an RDC server @@ -305,13 +301,12 @@ rdc_channel_create(rdc_channel_t *channel, const char *ip, const char *port, * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. * */ -rdc_status_t -rdc_channel_destroy(rdc_channel_t channel); +rdc_status_t rdc_channel_destroy(rdc_channel_t channel); /** @} */ // end of InitShutAdmin /*****************************************************************************/ -/** @defgroup RSMIAccess Remote ROCm SMI Calls +/** @defgroup RSMIAccess Remote ROCm SMI Calls * These functions calls make ROCm SMI function calls on the remote server. * Please refer to the * [ROCm SMI documentation] @@ -319,10 +314,10 @@ rdc_channel_destroy(rdc_channel_t channel); * information about the calls. Here, we will document any additional aspects * of the calls introduced by RDC that are not covered in the ROCm SMI * documentation. 
- * + * * All of the functions in this section attempt to make an RSMI call on the * server machine, given an ::rdc_channel_t associated with the server, and - * all the arguments that are required to make the RSMI call. + * all the arguments that are required to make the RSMI call. * @{ */ @@ -330,12 +325,10 @@ rdc_channel_destroy(rdc_channel_t channel); * @brief Remote call to rsmi_num_monitor_devices() * */ -rdc_status_t -rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu); +rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu); /** @} */ // end of RSMIAccess - /** @defgroup PhysQuer Physical State Queries * These functions provide information about the physical characteristics of * the device. @@ -345,34 +338,29 @@ rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu); * @brief Remote call to rsmi_dev_temp_metric_get() * */ -rdc_status_t -rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, - uint32_t sensor_type, rsmi_temperature_metric_t metric, - int64_t *temperature); +rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type, + rsmi_temperature_metric_t metric, int64_t* temperature); /** * @brief Remote call to rsmi_dev_fan_rpms_get() * */ -rdc_status_t -rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, - uint32_t sensor_ind, int64_t *rpms); +rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, + int64_t* rpms); /** * @brief Remote call to rsmi_dev_fan_speed_get() * */ -rdc_status_t -rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, - uint32_t sensor_ind, int64_t *speed); +rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, + int64_t* speed); /** * @brief Remote call to rsmi_dev_fan_speed_max_get() * */ -rdc_status_t -rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, - uint32_t sensor_ind, uint64_t *max_speed); +rdc_status_t 
rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, + uint64_t* max_speed); /** @} */ // end of PhysQuer /** @@ -389,7 +377,6 @@ rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call * */ -rdc_status_t -rdc_status_string(rdc_status_t status, const char **status_string); +rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string); #endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_H_ diff --git a/projects/rdc/client/include/rdc/rdc_client_main.h b/projects/rdc/client/include/rdc/rdc_client_main.h old mode 100755 new mode 100644 index 4b4413b46e..9e8a9a841a --- a/projects/rdc/client/include/rdc/rdc_client_main.h +++ b/projects/rdc/client/include/rdc/rdc_client_main.h @@ -26,8 +26,8 @@ THE SOFTWARE. #include -#include #include +#include #include "rdc.grpc.pb.h" // NOLINT #include "rdc/rdc_client.h" @@ -37,8 +37,7 @@ namespace rdc { class RDCChannel { public: - explicit RDCChannel(std::string server_ip, std::string server_port, - bool secure_channel); + explicit RDCChannel(std::string server_ip, std::string server_port, bool secure_channel); ~RDCChannel(); rdc_status_t Initialize(void); @@ -47,13 +46,12 @@ class RDCChannel { // Don't have setter for server ip and ports; we don't want to change those // after construction - std::string server_ip(void) const {return server_ip_;} - std::string server_port(void) const {return server_port_;} - bool secure_channel(void) const {return secure_channel_;} - std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub(void) const {return rsmi_stub_;} - std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub(void) const { - return rdc_admin_stub_;} - std::shared_ptr const channel(void) {return channel_;} + std::string server_ip(void) const { return server_ip_; } + std::string server_port(void) const { return server_port_; } + bool secure_channel(void) const { return secure_channel_; } + std::shared_ptr<::rdc::Rsmi::Stub> 
rsmi_stub(void) const { return rsmi_stub_; } + std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub(void) const { return rdc_admin_stub_; } + std::shared_ptr const channel(void) { return channel_; } private: std::string server_ip_; diff --git a/projects/rdc/client/include/rdc/rdc_client_utils.h b/projects/rdc/client/include/rdc/rdc_client_utils.h old mode 100755 new mode 100644 index 2bbde4c3f0..641a652caf --- a/projects/rdc/client/include/rdc/rdc_client_utils.h +++ b/projects/rdc/client/include/rdc/rdc_client_utils.h @@ -22,6 +22,7 @@ THE SOFTWARE. #ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_ #define CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_ +#include "rdc/rdc_client.h" namespace amd { namespace rdc { diff --git a/projects/rdc/client/include/rdc/rdc_exception.h b/projects/rdc/client/include/rdc/rdc_exception.h old mode 100755 new mode 100644 index 3a44555860..8d2b990332 --- a/projects/rdc/client/include/rdc/rdc_exception.h +++ b/projects/rdc/client/include/rdc/rdc_exception.h @@ -34,8 +34,8 @@ namespace rdc { /// @brief Exception type which carries an error code to return to the user. class rdc_exception : public std::exception { public: - rdc_exception(rdc_status_t error, const std::string description) : - err_(error), desc_(description) {} + rdc_exception(rdc_status_t error, const std::string description) + : err_(error), desc_(description) {} rdc_status_t error_code() const noexcept { return err_; } const char* what() const noexcept override { return desc_.c_str(); } @@ -48,4 +48,3 @@ class rdc_exception : public std::exception { } // namespace amd #endif // CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_ - diff --git a/projects/rdc/client/src/rdc_client.cc b/projects/rdc/client/src/rdc_client.cc old mode 100755 new mode 100644 index e89b2e7652..57ee7dead1 --- a/projects/rdc/client/src/rdc_client.cc +++ b/projects/rdc/client/src/rdc_client.cc @@ -20,30 +20,31 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include +#include "rdc/rdc_client.h" +#include #include #include + #include -#include "rdc/rdc_client_main.h" -#include "rdc/rdc_client.h" -#include "rdc/rdc_client_utils.h" #include "common/rdc_utils.h" -#include "rdc/rdc_exception.h" #include "rdc.grpc.pb.h" // NOLINT +#include "rdc/rdc_client_main.h" +#include "rdc/rdc_client_utils.h" +#include "rdc/rdc_exception.h" #include "rocm_smi/rocm_smi.h" -#define CHK_PTR_ARG(PTR) \ - if ((PTR) == nullptr) { \ +#define CHK_PTR_ARG(PTR) \ + if ((PTR) == nullptr) { \ return RDC_RSMI_STATUS_INVALID_ARGS; \ } -#define UINTPTR_TO_RDC_CHAN(UPTR) \ - amd::rdc::RDCChannel *ch = reinterpret_cast(UPTR); \ - if (ch == nullptr) { \ - return RDC_STATUS_GRPC_INVALID_ARG; \ - } \ +#define UINTPTR_TO_RDC_CHAN(UPTR) \ + amd::rdc::RDCChannel* ch = reinterpret_cast(UPTR); \ + if (ch == nullptr) { \ + return RDC_STATUS_GRPC_INVALID_ARG; \ + } static rdc_status_t handleException() { try { @@ -71,13 +72,15 @@ static rdc_status_t handleException() { } #define TRY try { -#define CATCH } catch (...) {return handleException();} +#define CATCH \ + } \ + catch (...) 
{ \ + return handleException(); \ + } -rdc_status_t -rdc_channel_create(rdc_channel_t *channel, const char *ip, - const char *port, bool secure) { - TRY - std::string server_str; +rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port, + bool secure) { + TRY std::string server_str; std::string port_str; if (channel == nullptr) { @@ -95,8 +98,7 @@ rdc_channel_create(rdc_channel_t *channel, const char *ip, port_str = std::to_string(RDC_DEFAULT_SERVER_PORT); } - amd::rdc::RDCChannel *ch = - new amd::rdc::RDCChannel(server_str, port_str, secure); + amd::rdc::RDCChannel* ch = new amd::rdc::RDCChannel(server_str, port_str, secure); if (ch == nullptr) { return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED; @@ -115,32 +117,26 @@ rdc_channel_create(rdc_channel_t *channel, const char *ip, CATCH } -rdc_status_t -rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect, - grpc_connectivity_state *state) { - TRY - CHK_PTR_ARG(state) - UINTPTR_TO_RDC_CHAN(channel) +rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect, + grpc_connectivity_state* state) { + TRY CHK_PTR_ARG(state) UINTPTR_TO_RDC_CHAN(channel) - *state = ch->channel()->GetState(try_to_connect); + * state = ch->channel()->GetState(try_to_connect); return RDC_STATUS_SUCCESS; CATCH } -rdc_status_t -rdc_channel_connection_verify(rdc_channel_t channel) { - TRY - UINTPTR_TO_RDC_CHAN(channel) +rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel) { + TRY UINTPTR_TO_RDC_CHAN(channel) - ::rdc::VerifyConnectionResponse resp; + ::rdc::VerifyConnectionResponse resp; ::rdc::VerifyConnectionRequest req; ::grpc::ClientContext context; unsigned int seed = time(NULL); req.set_magic_num(static_cast(rand_r(&seed))); - ::grpc::Status status = - ch->rdc_admin_stub()->VerifyConnection(&context, req, &resp); + ::grpc::Status status = ch->rdc_admin_stub()->VerifyConnection(&context, req, &resp); if (!status.ok()) { return amd::rdc::GrpcErrorToRdcError(status.error_code()); 
@@ -155,29 +151,23 @@ rdc_channel_connection_verify(rdc_channel_t channel) { CATCH } -rdc_status_t -rdc_channel_destroy(rdc_channel_t channel) { - TRY - UINTPTR_TO_RDC_CHAN(channel) +rdc_status_t rdc_channel_destroy(rdc_channel_t channel) { + TRY UINTPTR_TO_RDC_CHAN(channel) - delete ch; + delete ch; return RDC_STATUS_SUCCESS; CATCH } -rdc_status_t -rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu) { - TRY - CHK_PTR_ARG(num_gpu) - UINTPTR_TO_RDC_CHAN(channel) +rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu) { + TRY CHK_PTR_ARG(num_gpu) UINTPTR_TO_RDC_CHAN(channel) - ::rdc::GetNumDevicesResponse resp; + ::rdc::GetNumDevicesResponse resp; ::rdc::GetNumDevicesRequest empty; ::grpc::ClientContext context; - ::grpc::Status status = - ch->rsmi_stub()->GetNumDevices(&context, empty, &resp); + ::grpc::Status status = ch->rsmi_stub()->GetNumDevices(&context, empty, &resp); if (!status.ok()) { return amd::rdc::GrpcErrorToRdcError(status.error_code()); @@ -191,21 +181,16 @@ rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu) { // rsmi and rdc currently happen to have a 1-to-1 mapping, but // have this function in case that changes -static ::rdc::GetTemperatureRequest_TemperatureMetric - rsmi_temp2rdc_temp(rsmi_temperature_metric_t rsmi_temp) { - return - static_cast<::rdc::GetTemperatureRequest_TemperatureMetric>(rsmi_temp); +static ::rdc::GetTemperatureRequest_TemperatureMetric rsmi_temp2rdc_temp( + rsmi_temperature_metric_t rsmi_temp) { + return static_cast<::rdc::GetTemperatureRequest_TemperatureMetric>(rsmi_temp); } -rdc_status_t -rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, - uint32_t sensor_type, rsmi_temperature_metric_t metric, - int64_t *temperature) { - TRY - CHK_PTR_ARG(temperature) - UINTPTR_TO_RDC_CHAN(channel) +rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type, + rsmi_temperature_metric_t metric, int64_t* temperature) { + TRY CHK_PTR_ARG(temperature) 
UINTPTR_TO_RDC_CHAN(channel) - ::rdc::GetTemperatureResponse resp; + ::rdc::GetTemperatureResponse resp; ::rdc::GetTemperatureRequest in_args; ::grpc::ClientContext context; @@ -213,8 +198,7 @@ rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, in_args.set_dv_ind(dv_ind); in_args.set_sensor_type(sensor_type); - ::grpc::Status status = - ch->rsmi_stub()->GetTemperature(&context, in_args, &resp); + ::grpc::Status status = ch->rsmi_stub()->GetTemperature(&context, in_args, &resp); if (!status.ok()) { return ::amd::rdc::GrpcErrorToRdcError(status.error_code()); @@ -226,22 +210,18 @@ rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, CATCH } -rdc_status_t -rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, - uint32_t sensor_ind, int64_t *rpms) { - TRY - CHK_PTR_ARG(rpms) - UINTPTR_TO_RDC_CHAN(channel) +rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, + int64_t* rpms) { + TRY CHK_PTR_ARG(rpms) UINTPTR_TO_RDC_CHAN(channel) - ::rdc::GetFanRpmsResponse resp; + ::rdc::GetFanRpmsResponse resp; ::rdc::GetFanRpmsRequest in_args; ::grpc::ClientContext context; in_args.set_dv_ind(dv_ind); in_args.set_sensor_ind(sensor_ind); - ::grpc::Status status = - ch->rsmi_stub()->GetFanRpms(&context, in_args, &resp); + ::grpc::Status status = ch->rsmi_stub()->GetFanRpms(&context, in_args, &resp); if (!status.ok()) { return ::amd::rdc::GrpcErrorToRdcError(status.error_code()); @@ -253,22 +233,18 @@ rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, CATCH } -rdc_status_t -rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, - uint32_t sensor_ind, int64_t *speed) { - TRY - CHK_PTR_ARG(speed) - UINTPTR_TO_RDC_CHAN(channel) +rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, + int64_t* speed) { + TRY CHK_PTR_ARG(speed) UINTPTR_TO_RDC_CHAN(channel) - ::rdc::GetFanSpeedResponse resp; + ::rdc::GetFanSpeedResponse resp; ::rdc::GetFanSpeedRequest in_args; 
::grpc::ClientContext context; in_args.set_dv_ind(dv_ind); in_args.set_sensor_ind(sensor_ind); - ::grpc::Status status = - ch->rsmi_stub()->GetFanSpeed(&context, in_args, &resp); + ::grpc::Status status = ch->rsmi_stub()->GetFanSpeed(&context, in_args, &resp); if (!status.ok()) { return ::amd::rdc::GrpcErrorToRdcError(status.error_code()); @@ -280,22 +256,18 @@ rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, CATCH } -rdc_status_t -rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, - uint32_t sensor_ind, uint64_t *max_speed) { - TRY - CHK_PTR_ARG(max_speed) - UINTPTR_TO_RDC_CHAN(channel) +rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, + uint64_t* max_speed) { + TRY CHK_PTR_ARG(max_speed) UINTPTR_TO_RDC_CHAN(channel) - ::rdc::GetFanSpeedMaxResponse resp; + ::rdc::GetFanSpeedMaxResponse resp; ::rdc::GetFanSpeedMaxRequest in_args; ::grpc::ClientContext context; in_args.set_dv_ind(dv_ind); in_args.set_sensor_ind(sensor_ind); - ::grpc::Status status = - ch->rsmi_stub()->GetFanSpeedMax(&context, in_args, &resp); + ::grpc::Status status = ch->rsmi_stub()->GetFanSpeedMax(&context, in_args, &resp); if (!status.ok()) { return ::amd::rdc::GrpcErrorToRdcError(status.error_code()); @@ -307,89 +279,97 @@ rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, CATCH } -rdc_status_t -rdc_status_string(rdc_status_t status, const char **status_string) { - TRY - if (status_string == nullptr) { - return RDC_RSMI_STATUS_INVALID_ARGS; - } +rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string) { + TRY if (status_string == nullptr) { return RDC_RSMI_STATUS_INVALID_ARGS; } const size_t status_u = static_cast(status); switch (status_u) { case RDC_STATUS_SUCCESS: - *status_string = "RDC_STATUS_SUCCESS: The function has been executed" - " successfully."; + *status_string = + "RDC_STATUS_SUCCESS: The function has been executed" + " successfully."; break; case 
RDC_RSMI_STATUS_INVALID_ARGS: *status_string = "RDC_RSMI_STATUS_INVALID_ARGS: The provided arguments do not" - " meet the preconditions required for calling this function."; + " meet the preconditions required for calling this function."; break; case RDC_RSMI_STATUS_NOT_SUPPORTED: - *status_string = "RDC_RSMI_STATUS_NOT_SUPPORTED: This function is not" - " supported in the current environment."; + *status_string = + "RDC_RSMI_STATUS_NOT_SUPPORTED: This function is not" + " supported in the current environment."; break; case RDC_RSMI_STATUS_FILE_ERROR: *status_string = - "RDC_RSMI_STATUS_FILE_ERROR: There was an error in finding or" - " opening a file or directory. The operation may not be supported by " - "this Linux kernel version."; + "RDC_RSMI_STATUS_FILE_ERROR: There was an error in finding or" + " opening a file or directory. The operation may not be supported by " + "this Linux kernel version."; break; case RDC_RSMI_STATUS_PERMISSION: - *status_string = "RDC_RSMI_STATUS_PERMISSION: The user ID of the calling" - " process does not have sufficient permission to execute a command." - " Often this is fixed by running as root (sudo)."; + *status_string = + "RDC_RSMI_STATUS_PERMISSION: The user ID of the calling" + " process does not have sufficient permission to execute a command." 
+ " Often this is fixed by running as root (sudo)."; break; case RDC_RSMI_STATUS_OUT_OF_RESOURCES: - *status_string = "RDC_RSMI_STATUS_OUT_OF_RESOURCES: Unable to acquire " + *status_string = + "RDC_RSMI_STATUS_OUT_OF_RESOURCES: Unable to acquire " "memory or other resource"; break; case RDC_RSMI_STATUS_INTERNAL_EXCEPTION: - *status_string = "RDC_RSMI_STATUS_INTERNAL_EXCEPTION: An internal " + *status_string = + "RDC_RSMI_STATUS_INTERNAL_EXCEPTION: An internal " "exception was caught"; break; case RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS: - *status_string = "RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided " + *status_string = + "RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided " "input is out of allowable or safe range"; break; case RDC_RSMI_STATUS_INIT_ERROR: - *status_string = "RDC_RSMI_STATUS_INIT_ERROR: An error occurred during " + *status_string = + "RDC_RSMI_STATUS_INIT_ERROR: An error occurred during " "initialization, during " - "monitor discovery or when when initializing internal data structures"; + "monitor discovery or when when initializing internal data structures"; break; case RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED: - *status_string = "RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED: The called " + *status_string = + "RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED: The called " "function has not been implemented in this " - "system for this device type"; + "system for this device type"; break; case RDC_RSMI_STATUS_NOT_FOUND: - *status_string = "RDC_RSMI_STATUS_NOT_FOUND: An item required to " + *status_string = + "RDC_RSMI_STATUS_NOT_FOUND: An item required to " "complete the call was not found"; break; case RDC_RSMI_STATUS_INSUFFICIENT_SIZE: - *status_string = "RDC_RSMI_STATUS_INSUFFICIENT_SIZE: Not enough " + *status_string = + "RDC_RSMI_STATUS_INSUFFICIENT_SIZE: Not enough " "resources were available to fully execute" - " the call"; + " the call"; break; case RDC_RSMI_STATUS_UNKNOWN_ERROR: - *status_string = "An unknown error prevented the call from completing" - " 
successfully"; + *status_string = + "An unknown error prevented the call from completing" + " successfully"; break; case RDC_RSMI_STATUS_INTERRUPT: - *status_string = "RDC_RSMI_STATUS_INTERRUPT An interrupt occurred while " + *status_string = + "RDC_RSMI_STATUS_INTERRUPT An interrupt occurred while " "executing the function"; break; @@ -401,31 +381,31 @@ rdc_status_string(rdc_status_t status, const char **status_string) { case RDC_STATUS_GRPC_UNKNOWN: *status_string = - "RDC_STATUS_GRPC_UNKNOWN Unknown error. An example of where this error" - " may be returned is if a" - "Status value received from another address space belongs to an error-" - "space that is not known in this address space. Also errors raised by " - "APIs that do not return enough error information may be converted to " - "this error."; + "RDC_STATUS_GRPC_UNKNOWN Unknown error. An example of where this error" + " may be returned is if a" + "Status value received from another address space belongs to an error-" + "space that is not known in this address space. Also errors raised by " + "APIs that do not return enough error information may be converted to " + "this error."; break; case RDC_STATUS_GRPC_INVALID_ARG: *status_string = - "RDC_STATUS_GRPC_INVALID_ARG Client specified an invalid argument. " - "Note that this differs from" - "FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are " - "problematic regardless of the state of the system (e.g., a malformed " - "file name)."; + "RDC_STATUS_GRPC_INVALID_ARG Client specified an invalid argument. " + "Note that this differs from" + "FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are " + "problematic regardless of the state of the system (e.g., a malformed " + "file name)."; break; case RDC_STATUS_GRPC_DEADLINE_EXCEEDED: *status_string = - "RDC_STATUS_GRPC_DEADLINE_EXCEEDED Deadline expired before operation " - "could complete. 
For operations that" - "change the state of the system, this error may be returned even if " - "the operation has completed successfully. For example, a successful " - "response from a server could have been delayed long enough for the " - "deadline to expire."; + "RDC_STATUS_GRPC_DEADLINE_EXCEEDED Deadline expired before operation " + "could complete. For operations that" + "change the state of the system, this error may be returned even if " + "the operation has completed successfully. For example, a successful " + "response from a server could have been delayed long enough for the " + "deadline to expire."; break; case RDC_STATUS_GRPC_NOT_FOUND: @@ -436,130 +416,129 @@ rdc_status_string(rdc_status_t status, const char **status_string) { case RDC_STATUS_GRPC_ALREADY_EXISTS: *status_string = - "RDC_STATUS_GRPC_ALREADY_EXISTS Some entity that we attempted to create " - "(e.g., file or directory) already exists."; + "RDC_STATUS_GRPC_ALREADY_EXISTS Some entity that we " + "attempted to create " + "(e.g., file or directory) already exists."; break; case RDC_STATUS_GRPC_PERM_DENIED: *status_string = - "RDC_STATUS_GRPC_PERM_DENIED The caller does not have permission to " - "execute the specified operation." - "PERMISSION_DENIED must not be used for rejections caused by " - "exhausting some resource (use RESOURCE_EXHAUSTED instead for those " - "errors). PERMISSION_DENIED must not be used if the caller can not " - " be identified (use UNAUTHENTICATED instead for those errors)."; + "RDC_STATUS_GRPC_PERM_DENIED The caller does not have permission to " + "execute the specified operation." + "PERMISSION_DENIED must not be used for rejections caused by " + "exhausting some resource (use RESOURCE_EXHAUSTED instead for those " + "errors). 
PERMISSION_DENIED must not be used if the caller can not " + " be identified (use UNAUTHENTICATED instead for those errors)."; break; case RDC_STATUS_GRPC_UNAUTHENTICATED: *status_string = - "RDC_STATUS_GRPC_UNAUTHENTICATED The request does not have valid " - "authentication credentials for the operation."; + "RDC_STATUS_GRPC_UNAUTHENTICATED The request does not have valid " + "authentication credentials for the operation."; break; case RDC_STATUS_GRPC_RESOURCE_EXHAUSTED: *status_string = - "RDC_STATUS_GRPC_RESOURCE_EXHAUSTED Some resource has been exhausted, " - "perhaps a per-user quota, or perhaps the " - "entire file system is out of space."; + "RDC_STATUS_GRPC_RESOURCE_EXHAUSTED Some resource has been exhausted, " + "perhaps a per-user quota, or perhaps the " + "entire file system is out of space."; break; case RDC_STATUS_GRPC_FAILED_PRECOND: *status_string = - "RDC_STATUS_GRPC_FAILED_PRECOND Operation was rejected because the " - "system is not in a state required for " - "the operation's execution. For example, directory to be deleted may " - "be non-empty, an rmdir operation is applied to a non-directory, etc.\n" - "A litmus test that may help a service implementor in deciding " - "between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:\n" - " (a) Use UNAVAILABLE if the client can retry just the failing call.\n" - " (b) Use ABORTED if the client should retry at a higher-level " - " (e.g., restarting a read-modify-write sequence).\n" - " (c) Use FAILED_PRECONDITION if the client should not retry until" - " the system state has been explicitly fixed. E.g., if an \"rmdir\"" - " fails because the directory is non-empty, FAILED_PRECONDITION" - " should be returned since the client should not retry unless" - " they have first fixed up the directory by deleting files from it.\n" - " (d) Use FAILED_PRECONDITION if the client performs conditional" - " REST Get/Update/Delete on a resource and the resource on the" - " server does not match the condition. 
E.g., conflicting" - " read-modify-write on the same resource."; + "RDC_STATUS_GRPC_FAILED_PRECOND Operation was rejected because the " + "system is not in a state required for " + "the operation's execution. For example, directory to be deleted may " + "be non-empty, an rmdir operation is applied to a non-directory, etc.\n" + "A litmus test that may help a service implementor in deciding " + "between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:\n" + " (a) Use UNAVAILABLE if the client can retry just the failing call.\n" + " (b) Use ABORTED if the client should retry at a higher-level " + " (e.g., restarting a read-modify-write sequence).\n" + " (c) Use FAILED_PRECONDITION if the client should not retry until" + " the system state has been explicitly fixed. E.g., if an \"rmdir\"" + " fails because the directory is non-empty, FAILED_PRECONDITION" + " should be returned since the client should not retry unless" + " they have first fixed up the directory by deleting files from it.\n" + " (d) Use FAILED_PRECONDITION if the client performs conditional" + " REST Get/Update/Delete on a resource and the resource on the" + " server does not match the condition. E.g., conflicting" + " read-modify-write on the same resource."; break; case RDC_STATUS_GRPC_ABORTED: *status_string = - "RDC_STATUS_GRPC_ABORTED The operation was aborted, " - "typically due to a concurrency issue like " - "sequencer check failures, transaction aborts, etc.\n" - "See litmus test above for deciding between " - "FAILED_PRECONDITION, ABORTED, " - "and UNAVAILABLE."; + "RDC_STATUS_GRPC_ABORTED The operation was aborted, " + "typically due to a concurrency issue like " + "sequencer check failures, transaction aborts, etc.\n" + "See litmus test above for deciding between " + "FAILED_PRECONDITION, ABORTED, " + "and UNAVAILABLE."; break; case RDC_STATUS_GRPC_OUT_OF_RANGE: *status_string = - "RDC_STATUS_GRPC_OUT_OF_RANGE Operation was attempted " - "past the valid range. 
E.g., seeking or reading " - "past end of file.\n" - "Unlike INVALID_ARGUMENT, this error indicates a " - "problem that may be fixed " - "if the system state changes. For example, a 32-bit file system will " - "generate INVALID_ARGUMENT if asked to read " - "at an offset that is not in the " - "range [0,2^32-1], but it will generate " - "OUT_OF_RANGE if asked to read from " - "an offset past the current file size.\n" - "There is a fair bit of overlap between FAILED_PRECONDITION and " - "OUT_OF_RANGE. We recommend using OUT_OF_RANGE " - "(the more specific error) " - "when it applies so that callers who are " - "iterating through a space can " - "easily look for an OUT_OF_RANGE error to detect when they are done."; + "RDC_STATUS_GRPC_OUT_OF_RANGE Operation was attempted " + "past the valid range. E.g., seeking or reading " + "past end of file.\n" + "Unlike INVALID_ARGUMENT, this error indicates a " + "problem that may be fixed " + "if the system state changes. For example, a 32-bit file system will " + "generate INVALID_ARGUMENT if asked to read " + "at an offset that is not in the " + "range [0,2^32-1], but it will generate " + "OUT_OF_RANGE if asked to read from " + "an offset past the current file size.\n" + "There is a fair bit of overlap between FAILED_PRECONDITION and " + "OUT_OF_RANGE. We recommend using OUT_OF_RANGE " + "(the more specific error) " + "when it applies so that callers who are " + "iterating through a space can " + "easily look for an OUT_OF_RANGE error to detect when they are done."; break; case RDC_STATUS_GRPC_UNIMPLEMENTED: *status_string = - "RDC_STATUS_GRPC_UNIMPLEMENTED Operation is not " - "implemented or not supported/enabled in this service."; + "RDC_STATUS_GRPC_UNIMPLEMENTED Operation is not " + "implemented or not supported/enabled in this service."; break; case RDC_STATUS_GRPC_INTERNAL: *status_string = - "RDC_STATUS_GRPC_INTERNAL Internal errors. This means " - "some invariants expected by underlying System has " - "been broken. 
If you see one of these errors."; + "RDC_STATUS_GRPC_INTERNAL Internal errors. This means " + "some invariants expected by underlying System has " + "been broken. If you see one of these errors."; break; case RDC_STATUS_GRPC_UNAVAILABLE: *status_string = - "RDC_STATUS_GRPC_UNAVAILABLE The service is currently unavailable. " - "This is a most likely a transient " - "condition and may be corrected by retrying with a backoff.\n" - "Warning: Although data MIGHT not have been transmitted when this " - "status occurs, there is NOT A GUARANTEE that the server has not seen " - "anything. So in general it is unsafe to retry on this status code " - "if the call is non-idempotent. " - "See litmus test above for deciding between " - "FAILED_PRECONDITION, ABORTED," - "and UNAVAILABLE."; + "RDC_STATUS_GRPC_UNAVAILABLE The service is currently unavailable. " + "This is a most likely a transient " + "condition and may be corrected by retrying with a backoff.\n" + "Warning: Although data MIGHT not have been transmitted when this " + "status occurs, there is NOT A GUARANTEE that the server has not seen " + "anything. So in general it is unsafe to retry on this status code " + "if the call is non-idempotent. 
" + "See litmus test above for deciding between " + "FAILED_PRECONDITION, ABORTED," + "and UNAVAILABLE."; break; case RDC_STATUS_GRPC_DATA_LOSS: - *status_string = - "RDC_STATUS_GRPC_DATA_LOSS Unrecoverable data loss or corruption."; + *status_string = "RDC_STATUS_GRPC_DATA_LOSS Unrecoverable data loss or corruption."; break; case RDC_STATUS_UNKNOWN_ERROR: - *status_string = - "RDC_STATUS_UNKNOWN_ERROR An unknown RDC error occurred."; + *status_string = "RDC_STATUS_UNKNOWN_ERROR An unknown RDC error occurred."; break; case RDC_STATUS_CLIENT_ERR_SSL: - *status_string = - "An error occurred when executing SSL authentication operations."; + *status_string = "An error occurred when executing SSL authentication operations."; break; default: - *status_string = "RDC_RSMI_STATUS_UNKNOWN_ERROR An " + *status_string = + "RDC_RSMI_STATUS_UNKNOWN_ERROR An " "unknown error occurred"; return RDC_RSMI_STATUS_UNKNOWN_ERROR; } diff --git a/projects/rdc/client/src/rdc_client_main.cc b/projects/rdc/client/src/rdc_client_main.cc old mode 100755 new mode 100644 index 520678ffc1..978bacaa2c --- a/projects/rdc/client/src/rdc_client_main.cc +++ b/projects/rdc/client/src/rdc_client_main.cc @@ -21,46 +21,39 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "rdc/rdc_client_main.h" + #include #include #include -#include "rdc.grpc.pb.h" // NOLINT -#include "rdc/rdc_client_main.h" -#include "rdc/rdc_client.h" #include "common/rdc_utils.h" +#include "rdc.grpc.pb.h" // NOLINT +#include "rdc/rdc_client.h" namespace amd { namespace rdc { #ifdef USE_PINNED_CERTS // Pinned certificates -static const char *kDefaultRDCServerCertPinPath = - "/etc/rdc/server/rdc_server.crt"; -static const char *kDefaultRDCClientKeyPinPath = - "/etc/rdc/client/private/rdc_client.key"; -static const char *kDefaultRDCClientCertPinPath = - "/etc/rdc/client/rdc_client.crt"; +static const char* kDefaultRDCServerCertPinPath = "/etc/rdc/server/rdc_server.crt"; +static const char* kDefaultRDCClientKeyPinPath = "/etc/rdc/client/private/rdc_client.key"; +static const char* kDefaultRDCClientCertPinPath = "/etc/rdc/client/rdc_client.crt"; #endif // USE_PINNED_CERTS // PKI certificates -static const char * kDefaultRDCClientCertKeyPkiPath = - "/etc/rdc/client/private/rdc_client_cert.key"; -static const char * kDefaultRDCClientCertPemPkiPath = - "/etc/rdc/client/certs/rdc_client_cert.pem"; -static const char * kDefaultRDCClientCACertPemPkiPath = - "/etc/rdc/client/certs/rdc_cacert.pem"; +static const char* kDefaultRDCClientCertKeyPkiPath = "/etc/rdc/client/private/rdc_client_cert.key"; +static const char* kDefaultRDCClientCertPemPkiPath = "/etc/rdc/client/certs/rdc_client_cert.pem"; +static const char* kDefaultRDCClientCACertPemPkiPath = "/etc/rdc/client/certs/rdc_cacert.pem"; -RDCChannel::RDCChannel(std::string server_ip, std::string server_port, - bool secure) : server_ip_(server_ip), server_port_(server_port), - secure_channel_(secure) {} +RDCChannel::RDCChannel(std::string server_ip, std::string server_port, bool secure) + : server_ip_(server_ip), server_port_(server_port), secure_channel_(secure) {} -RDCChannel::~RDCChannel() { -} +RDCChannel::~RDCChannel() {} #ifdef USE_PINNED_CERTS -static int ConstructSSLOptsPin(grpc::SslCredentialsOptions 
*ssl_opts) { +static int ConstructSSLOptsPin(grpc::SslCredentialsOptions* ssl_opts) { assert(ssl_opts != nullptr); if (ssl_opts == nullptr) { return -EINVAL; @@ -100,7 +93,7 @@ static int ConstructSSLOptsPin(grpc::SslCredentialsOptions *ssl_opts) { } #endif // USE_PINNED_CERTS -static int ConstructSSLOptsPKI(grpc::SslCredentialsOptions *ssl_opts) { +static int ConstructSSLOptsPKI(grpc::SslCredentialsOptions* ssl_opts) { assert(ssl_opts != nullptr); if (ssl_opts == nullptr) { return -EINVAL; @@ -139,8 +132,7 @@ static int ConstructSSLOptsPKI(grpc::SslCredentialsOptions *ssl_opts) { return 0; } -rdc_status_t -RDCChannel::Initialize(void) { +rdc_status_t RDCChannel::Initialize(void) { assert(!server_port_.empty()); assert(!server_ip_.empty()); @@ -157,16 +149,14 @@ RDCChannel::Initialize(void) { ret = ConstructSSLOptsPKI(&ssl_opts); #endif if (ret) { - std::cerr << "Failed to process OpenSSL keys and certificates." << - std::endl; + std::cerr << "Failed to process OpenSSL keys and certificates." << std::endl; return RDC_STATUS_CLIENT_ERR_SSL; } channel_creds_ = grpc::SslCredentials(ssl_opts); channel_ = grpc::CreateChannel(addr_str, channel_creds_); } else { - channel_ = ::grpc::CreateChannel(addr_str, - grpc::InsecureChannelCredentials()); + channel_ = ::grpc::CreateChannel(addr_str, grpc::InsecureChannelCredentials()); } rsmi_stub_ = ::rdc::Rsmi::NewStub(channel_); diff --git a/projects/rdc/client/src/rdc_client_utils.cc b/projects/rdc/client/src/rdc_client_utils.cc old mode 100755 new mode 100644 index 4ecf6c8fe3..151ab1aa1c --- a/projects/rdc/client/src/rdc_client_utils.cc +++ b/projects/rdc/client/src/rdc_client_utils.cc @@ -20,17 +20,17 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "rdc/rdc_client.h" -#include "rdc.grpc.pb.h" // NOLINT #include "rdc/rdc_client_utils.h" +#include "rdc.grpc.pb.h" // NOLINT +#include "rdc/rdc_client.h" + namespace amd { namespace rdc { rdc_status_t GrpcErrorToRdcError(grpc::StatusCode grpc_err) { uint32_t grpc_err_int = static_cast(grpc_err); - uint32_t rdc_grpc_base_int = - static_cast(RDC_STATUS_GRPC_ERR_FIRST); + uint32_t rdc_grpc_base_int = static_cast(RDC_STATUS_GRPC_ERR_FIRST); uint32_t rdc_err_int = grpc_err_int + rdc_grpc_base_int; return static_cast(rdc_err_int); diff --git a/projects/rdc/common/rdc_capabilities.cc b/projects/rdc/common/rdc_capabilities.cc old mode 100755 new mode 100644 index 59caff17b0..db41302288 --- a/projects/rdc/common/rdc_capabilities.cc +++ b/projects/rdc/common/rdc_capabilities.cc @@ -20,16 +20,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include -#include -#include - #include "common/rdc_capabilities.h" +#include +#include +#include + namespace amd { namespace rdc { -int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled) { +int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool* enabled) { cap_t caps; assert(enabled != nullptr); @@ -41,7 +41,7 @@ int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled) { // Get process's current capabilities caps = cap_get_proc(); if (caps == nullptr) { - return errno; + return errno; } cap_flag_value_t val; @@ -52,7 +52,7 @@ int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled) { } if (cap_free(caps) == -1) { - return errno; + return errno; } *enabled = (val == CAP_SET ? 
true : false); @@ -68,16 +68,15 @@ int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable) { // Get process's current capabilities caps = cap_get_proc(); if (caps == nullptr) { - return errno; + return errno; } // the 1 in the call below is the size of the cap_list array cap_list[0] = cap; - if (cap_set_flag(caps, cap_type, 1, cap_list, enable ? CAP_SET : CAP_CLEAR) - == -1) { + if (cap_set_flag(caps, cap_type, 1, cap_list, enable ? CAP_SET : CAP_CLEAR) == -1) { int ret = errno; cap_free(caps); - return ret; + return ret; } if (cap_set_proc(caps) == -1) { @@ -87,7 +86,7 @@ int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable) { } if (cap_free(caps) == -1) { - return errno; + return errno; } return 0; } diff --git a/projects/rdc/common/rdc_capabilities.h b/projects/rdc/common/rdc_capabilities.h old mode 100755 new mode 100644 index 30dd6e0d4e..15b8e793db --- a/projects/rdc/common/rdc_capabilities.h +++ b/projects/rdc/common/rdc_capabilities.h @@ -28,29 +28,24 @@ THE SOFTWARE. 
namespace amd { namespace rdc { -int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled); +int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool* enabled); int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable); struct ScopedCapability { - ScopedCapability(cap_value_t cp, cap_flag_t cpt) : - cap_(cp), cap_type_(cpt), error_(0) { - error_ = ModifyCapability(cap_, cap_type_, true); - } - ~ScopedCapability() { - error_ = ModifyCapability(cap_, cap_type_, false); - } - void Relinquish(void) { - error_ = ModifyCapability(cap_, cap_type_, false); - } - int error(void) {return error_;} - private: - cap_value_t cap_; - cap_flag_t cap_type_; - int error_; + ScopedCapability(cap_value_t cp, cap_flag_t cpt) : cap_(cp), cap_type_(cpt), error_(0) { + error_ = ModifyCapability(cap_, cap_type_, true); + } + ~ScopedCapability() { error_ = ModifyCapability(cap_, cap_type_, false); } + void Relinquish(void) { error_ = ModifyCapability(cap_, cap_type_, false); } + int error(void) { return error_; } + + private: + cap_value_t cap_; + cap_flag_t cap_type_; + int error_; }; } // namespace rdc } // namespace amd #endif // COMMON_RDC_CAPABILITIES_H_ - diff --git a/projects/rdc/common/rdc_fields_supported.cc b/projects/rdc/common/rdc_fields_supported.cc index 4a2dc98de1..243dab2559 100644 --- a/projects/rdc/common/rdc_fields_supported.cc +++ b/projects/rdc/common/rdc_fields_supported.cc @@ -19,54 +19,48 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "common/rdc_fields_supported.h" + #include #include -#include "common/rdc_fields_supported.h" #include "rdc/rdc.h" namespace amd { namespace rdc { -#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) \ - {static_cast(ID), {#ID, (DESC), (LABEL), (DISPLAY)}}, +#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) \ + {static_cast(ID), {#ID, (DESC), (LABEL), (DISPLAY)}}, static const fld_id2name_map_t field_id_to_descript = { - #include "common/rdc_field.data" +#include "common/rdc_field.data" }; #undef FLD_DESC_ENT #define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) {#ID, (ID)}, static fld_name2id_map_t field_name_to_id = { - #include "common/rdc_field.data" // NOLINT +#include "common/rdc_field.data" // NOLINT }; #undef FLD_DESC_ENT +amd::rdc::fld_id2name_map_t& get_field_id_description_from_id(void) { return field_id_to_descript; } +bool get_field_id_from_name(const std::string name, rdc_field_t* value) { + assert(value != nullptr); + auto id = field_name_to_id.find(name); + if (id == field_name_to_id.end()) { + return false; + } -amd::rdc::fld_id2name_map_t & -get_field_id_description_from_id(void) { - return field_id_to_descript; -} - -bool get_field_id_from_name(const std::string name, rdc_field_t *value) { - assert(value != nullptr); - auto id = field_name_to_id.find(name); - if (id == field_name_to_id.end()) { - return false; - } - - *value = static_cast(id->second); - return true; + *value = static_cast(id->second); + return true; } bool is_field_valid(rdc_field_t field_id) { if (field_id == RDC_FI_INVALID) { return false; } - return field_id_to_descript.find(static_cast(field_id)) != - field_id_to_descript.end(); + return field_id_to_descript.find(static_cast(field_id)) != field_id_to_descript.end(); } - } // namespace rdc } // namespace amd diff --git a/projects/rdc/common/rdc_fields_supported.h b/projects/rdc/common/rdc_fields_supported.h index 90e5c31d86..6cc7f8dcd1 100644 --- a/projects/rdc/common/rdc_fields_supported.h +++ 
b/projects/rdc/common/rdc_fields_supported.h @@ -22,8 +22,8 @@ THE SOFTWARE. #ifndef COMMON_RDC_FIELDS_SUPPORTED_H_ #define COMMON_RDC_FIELDS_SUPPORTED_H_ -#include #include +#include #include #include "rdc/rdc.h" @@ -32,18 +32,17 @@ namespace amd { namespace rdc { typedef struct { - std::string enum_name; - std::string description; - std::string label; - bool do_display; + std::string enum_name; + std::string description; + std::string label; + bool do_display; } field_id_descript; -typedef const std::map - fld_id2name_map_t; +typedef const std::map fld_id2name_map_t; typedef std::unordered_map fld_name2id_map_t; -bool get_field_id_from_name(const std::string name, rdc_field_t *value); -fld_id2name_map_t & get_field_id_description_from_id(void); // NOLINT +bool get_field_id_from_name(const std::string name, rdc_field_t* value); +fld_id2name_map_t& get_field_id_description_from_id(void); // NOLINT bool is_field_valid(rdc_field_t field_id); } // namespace rdc diff --git a/projects/rdc/common/rdc_utils.cc b/projects/rdc/common/rdc_utils.cc old mode 100755 new mode 100644 index 4ac1786005..4650731caf --- a/projects/rdc/common/rdc_utils.cc +++ b/projects/rdc/common/rdc_utils.cc @@ -20,28 +20,28 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "common/rdc_utils.h" + +#include +#include #include #include -#include -#include -#include -#include -#include -#include #include - -#include "common/rdc_utils.h" +#include +#include +#include +#include namespace amd { namespace rdc { -bool FileExists(char const *filename) { +bool FileExists(char const* filename) { struct stat buf; return (stat(filename, &buf) == 0); } -int ReadFile(std::string path, std::string *retStr, bool chop_newline) { +int ReadFile(std::string path, std::string* retStr, bool chop_newline) { std::stringstream ss; int ret = 0; @@ -61,13 +61,12 @@ int ReadFile(std::string path, std::string *retStr, bool chop_newline) { *retStr = ss.str(); if (chop_newline) { - retStr->erase(std::remove(retStr->begin(), retStr->end(), '\n'), - retStr->end()); + retStr->erase(std::remove(retStr->begin(), retStr->end(), '\n'), retStr->end()); } return ret; } -int ReadFile(const char *path, std::string *retStr, bool chop_newline) { +int ReadFile(const char* path, std::string* retStr, bool chop_newline) { assert(path != nullptr); assert(retStr != nullptr); @@ -76,11 +75,11 @@ int ReadFile(const char *path, std::string *retStr, bool chop_newline) { return amd::rdc::ReadFile(file_path, retStr, chop_newline); } -bool IsNumber(const std::string &s) { +bool IsNumber(const std::string& s) { return !s.empty() && std::all_of(s.begin(), s.end(), ::isdigit); } -bool IsIP(const std::string &s) { +bool IsIP(const std::string& s) { struct sockaddr_in sa; int result = inet_pton(AF_INET, s.c_str(), &sa); // inet_pton returns 1 on success diff --git a/projects/rdc/common/rdc_utils.h b/projects/rdc/common/rdc_utils.h old mode 100755 new mode 100644 index 723b9dd300..3f14d7f70b --- a/projects/rdc/common/rdc_utils.h +++ b/projects/rdc/common/rdc_utils.h @@ -30,29 +30,25 @@ namespace amd { namespace rdc { #ifdef NDEBUG -#define debug_print(fmt, ...) \ - do { \ +#define debug_print(fmt, ...) \ + do { \ } while (false) #else -#define debug_print(fmt, ...) 
\ - do { \ - fprintf(stderr, fmt, ##__VA_ARGS__); \ +#define debug_print(fmt, ...) \ + do { \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ } while (false) #endif -bool -FileExists(char const *filename); +bool FileExists(char const* filename); -int -ReadFile(std::string path, std::string *retStr, bool chop_newline = false); -int -ReadFile(const char *path, std::string *retStr, bool chop_newline = false); +int ReadFile(std::string path, std::string* retStr, bool chop_newline = false); +int ReadFile(const char* path, std::string* retStr, bool chop_newline = false); -bool IsNumber(const std::string &s); -bool IsIP(const std::string &s); +bool IsNumber(const std::string& s); +bool IsIP(const std::string& s); } // namespace rdc } // namespace amd #endif // COMMON_RDC_UTILS_H_ - diff --git a/projects/rdc/example/diagnostic_example.cc b/projects/rdc/example/diagnostic_example.cc index 72973cc85f..34dd99c0df 100644 --- a/projects/rdc/example/diagnostic_example.cc +++ b/projects/rdc/example/diagnostic_example.cc @@ -20,154 +20,139 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include #include -#include +#include + #include +#include #include #include + #include "rdc/rdc.h" static std::string get_test_name(rdc_diag_test_cases_t test_case) { - const std::map test_desc = { - {RDC_DIAG_COMPUTE_PROCESS, "No compute process"}, - {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"}, - {RDC_DIAG_SYS_MEM_CHECK, "System memory check"}, - {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"}, - {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}, - {RDC_DIAG_TEST_LAST, "Unknown"} - }; + const std::map test_desc = { + {RDC_DIAG_COMPUTE_PROCESS, "No compute process"}, + {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"}, + {RDC_DIAG_SYS_MEM_CHECK, "System memory check"}, + {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"}, + {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}, + {RDC_DIAG_TEST_LAST, "Unknown"}}; - auto test_name = test_desc.find(test_case); - if (test_name == test_desc.end()) { - return "Unknown Test"; - } - return test_name->second; + auto test_name = test_desc.find(test_case); + if (test_name == test_desc.end()) { + return "Unknown Test"; + } + return test_name->second; } -int main(int, char **) { - rdc_status_t result; - rdc_handle_t rdc_handle; - bool standalone = false; - char hostIpAddress[] = {"127.0.0.1:50051"}; - char group_name[] = {"diag_group"}; +int main(int, char**) { + rdc_status_t result; + rdc_handle_t rdc_handle; + bool standalone = false; + char hostIpAddress[] = {"127.0.0.1:50051"}; + char group_name[] = {"diag_group"}; - // Select the embedded mode and standalone mode dynamically. - std::cout << "Start rdci in: \n"; - std::cout << "0 - Embedded mode \n"; - std::cout << "1 - Standalone mode \n"; - while (!(std::cin >> standalone)) { - std::cout << "Invalid input.\n"; - std::cin.clear(); - std::cin.ignore(); - } - std::cout << std::endl; - std::cout << (standalone? - "Standalone mode selected.\n":"Embedded mode selected.\n"); + // Select the embedded mode and standalone mode dynamically. 
+ std::cout << "Start rdci in: \n"; + std::cout << "0 - Embedded mode \n"; + std::cout << "1 - Standalone mode \n"; + while (!(std::cin >> standalone)) { + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); + } + std::cout << std::endl; + std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n"); - // Init the rdc - result = rdc_init(0); + // Init the rdc + result = rdc_init(0); + if (result != RDC_ST_OK) { + std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } else { + std::cout << "RDC Initialized.\n"; + } + + if (standalone) { // standalone + result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); if (result != RDC_ST_OK) { - std::cout << "Error initializing RDC. Return: " << - rdc_status_string(result) << std::endl; - goto cleanup; - } else { - std::cout << "RDC Initialized.\n"; + std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; } - - if (standalone) { // standalone - result = rdc_connect(hostIpAddress, &rdc_handle, - nullptr, nullptr, nullptr); - if ( result != RDC_ST_OK ) { - std::cout << "Error connecting to remote rdcd. Return: " - << rdc_status_string(result) << std::endl; - goto cleanup; - } - } else { // embedded - result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); - if (result != RDC_ST_OK) { - std::cout << "Error starting embedded RDC engine. Return: " - << rdc_status_string(result) << std::endl; - goto cleanup; - } - } - - // Now we can use the same API for both standalone and embedded - // (1) create group for all GPUs - rdc_gpu_group_t group_id; - result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_DEFAULT, - group_name, &group_id); + } else { // embedded + result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); if (result != RDC_ST_OK) { - std::cout << "Error creating group. 
Return: " - << rdc_status_string(result); - goto cleanup; + std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; } + } - // (2) start to run short diagnostic. - rdc_diag_response_t response; - result = rdc_diagnostic_run(rdc_handle, group_id, - RDC_DIAG_LVL_SHORT, &response); + // Now we can use the same API for both standalone and embedded + // (1) create group for all GPUs + rdc_gpu_group_t group_id; + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_DEFAULT, group_name, &group_id); + if (result != RDC_ST_OK) { + std::cout << "Error creating group. Return: " << rdc_status_string(result); + goto cleanup; + } - if (result != RDC_ST_OK) { - std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. Return: " - << rdc_status_string(result); - goto cleanup; - } + // (2) start to run short diagnostic. + rdc_diag_response_t response; + result = rdc_diagnostic_run(rdc_handle, group_id, RDC_DIAG_LVL_SHORT, &response); - // (3) Check diagnostic results - for (uint32_t i=0 ; i < response.results_count; i++) { - const rdc_diag_test_result_t& test_result = - response.diag_info[i]; - std::cout << std::setw(22) << std::left - << get_test_name(test_result.test_case) + ":" - << rdc_diagnostic_result_string(test_result.status) << "\n"; - } + if (result != RDC_ST_OK) { + std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. 
Return: " << rdc_status_string(result); + goto cleanup; + } - // (4) diagnostic detail information - std::cout <<" =============== Diagnostic Details ==================\n"; - for (uint32_t i=0 ; i < response.results_count; i++) { - const rdc_diag_test_result_t& test_result = - response.diag_info[i]; - if (test_result.info[0] != '\0') { - std::cout << std::setw(22) << std::left - << get_test_name(test_result.test_case) + ":" + // (3) Check diagnostic results + for (uint32_t i = 0; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = response.diag_info[i]; + std::cout << std::setw(22) << std::left << get_test_name(test_result.test_case) + ":" + << rdc_diagnostic_result_string(test_result.status) << "\n"; + } + + // (4) diagnostic detail information + std::cout << " =============== Diagnostic Details ==================\n"; + for (uint32_t i = 0; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = response.diag_info[i]; + if (test_result.info[0] != '\0') { + std::cout << std::setw(22) << std::left << get_test_name(test_result.test_case) + ":" << test_result.info << "\n"; - } - for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) { - const rdc_diag_per_gpu_result_t& gpu_result - = test_result.gpu_results[j]; - if (strlen(gpu_result.gpu_result.msg) > 0) { - std::cout << " GPU " << gpu_result.gpu_index - << " " << gpu_result.gpu_result.msg << "\n"; - } - } } - - // (5) run one test case - std::cout <<" ============== Run individual diagnostic test ===========\n"; - rdc_diag_test_result_t test_result; - result = rdc_test_case_run(rdc_handle, group_id, - RDC_DIAG_COMPUTE_PROCESS, &test_result); - - if (result != RDC_ST_OK) { - std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. 
Return: " - << rdc_status_string(result); - goto cleanup; + for (uint32_t j = 0; j < test_result.per_gpu_result_count; j++) { + const rdc_diag_per_gpu_result_t& gpu_result = test_result.gpu_results[j]; + if (strlen(gpu_result.gpu_result.msg) > 0) { + std::cout << " GPU " << gpu_result.gpu_index << " " << gpu_result.gpu_result.msg << "\n"; + } } + } - std::cout << std::setw(22) << std::left - << get_test_name(RDC_DIAG_COMPUTE_PROCESS) + ":" - << test_result.info << "\n"; + // (5) run one test case + std::cout << " ============== Run individual diagnostic test ===========\n"; + rdc_diag_test_result_t test_result; + result = rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, &test_result); + if (result != RDC_ST_OK) { + std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: " + << rdc_status_string(result); + goto cleanup; + } - // Cleanup consists of shutting down RDC. - cleanup: - std::cout << "Cleaning up.\n"; - if (standalone) - rdc_disconnect(rdc_handle); - else - rdc_stop_embedded(rdc_handle); - rdc_shutdown(); - return result; + std::cout << std::setw(22) << std::left << get_test_name(RDC_DIAG_COMPUTE_PROCESS) + ":" + << test_result.info << "\n"; + +// Cleanup consists of shutting down RDC. +cleanup: + std::cout << "Cleaning up.\n"; + if (standalone) + rdc_disconnect(rdc_handle); + else + rdc_stop_embedded(rdc_handle); + rdc_shutdown(); + return result; } diff --git a/projects/rdc/example/field_value_example.cc b/projects/rdc/example/field_value_example.cc index 8f7a1b6ae6..1d198dd7ba 100644 --- a/projects/rdc/example/field_value_example.cc +++ b/projects/rdc/example/field_value_example.cc @@ -21,257 +21,235 @@ THE SOFTWARE. 
*/ #include -#include + #include +#include + #include "rdc/rdc.h" -int main(int, char **) { - rdc_status_t result; - rdc_handle_t rdc_handle; - bool standalone = false; - char hostIpAddress[] = {"127.0.0.1:50051"}; - char group_name[] = {"group1"}; - char field_group_name[] = {"fieldgroup1"}; - uint64_t since_timestamp = 0; - uint64_t next_timestamp = 0; - uint64_t start_timestamp = 0; - uint32_t count = 0; +int main(int, char**) { + rdc_status_t result; + rdc_handle_t rdc_handle; + bool standalone = false; + char hostIpAddress[] = {"127.0.0.1:50051"}; + char group_name[] = {"group1"}; + char field_group_name[] = {"fieldgroup1"}; + uint64_t since_timestamp = 0; + uint64_t next_timestamp = 0; + uint64_t start_timestamp = 0; + uint32_t count = 0; + // Select the embedded mode and standalone mode dynamically. + std::cout << "Start rdci in: \n"; + std::cout << "0 - Embedded mode \n"; + std::cout << "1 - Standalone mode \n"; + while (!(std::cin >> standalone)) { + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); + } + std::cout << std::endl; + std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n"); - // Select the embedded mode and standalone mode dynamically. - std::cout << "Start rdci in: \n"; - std::cout << "0 - Embedded mode \n"; - std::cout << "1 - Standalone mode \n"; - while (!(std::cin >> standalone)) { - std::cout << "Invalid input.\n"; - std::cin.clear(); - std::cin.ignore(); - } - std::cout << std::endl; - std::cout << (standalone? - "Standalone mode selected.\n":"Embedded mode selected.\n"); + // Init the rdc + result = rdc_init(0); - // Init the rdc - result = rdc_init(0); + if (result != RDC_ST_OK) { + std::cout << "Error initializing RDC. 
Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } else { + std::cout << "RDC Initialized.\n"; + } + if (standalone) { // standalone + result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); if (result != RDC_ST_OK) { - std::cout << "Error initializing RDC. Return: " << - rdc_status_string(result) << std::endl; - goto cleanup; - } else { - std::cout << "RDC Initialized.\n"; + std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; } + } else { // embedded + result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); + if (result != RDC_ST_OK) { + std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + } - if (standalone) { // standalone - result = rdc_connect(hostIpAddress, &rdc_handle, - nullptr, nullptr, nullptr); - if ( result != RDC_ST_OK ) { - std::cout << "Error connecting to remote rdcd. Return: " - << rdc_status_string(result) << std::endl; - goto cleanup; + // Now we can use the same API for both standalone and embedded + // Get the list of devices in the system + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + result = rdc_device_get_all(rdc_handle, gpu_index_list, &count); + if (result != RDC_ST_OK) { + std::cout << "Error to find devices on the system. Return: " << rdc_status_string(result); + goto cleanup; + } + if (count == 0) { + std::cout << "No GPUs find on the sytem "; + goto cleanup; + } else { + std::cout << count << " GPUs found in the system.\n"; + } + + // Create the group + rdc_gpu_group_t group_id; + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id); + if (result != RDC_ST_OK) { + std::cout << "Error creating group. 
Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Created the GPU group " << group_id << std::endl; + + // Add all GPUs to the group + for (uint32_t i = 0; i < count; i++) { + result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]); // Add GPU 0 + if (result != RDC_ST_OK) { + std::cout << "Error adding group. Return: " << rdc_status_string(result); + goto cleanup; + } + rdc_device_attributes_t attribute; + result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute); + if (result != RDC_ST_OK) { + std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Add GPU " << gpu_index_list[i] << ":" << attribute.device_name << " to group " + << group_id << std::endl; + } + + // Create the field groups to monitor POWER and TEMP + rdc_field_grp_t field_group_id; + rdc_field_t field_ids[2]; + field_ids[0] = RDC_FI_GPU_MEMORY_USAGE; + field_ids[1] = RDC_FI_POWER_USAGE; + result = rdc_group_field_create(rdc_handle, 2, &field_ids[0], field_group_name, &field_group_id); + if (result != RDC_ST_OK) { + std::cout << "Error create field group, Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Created the field group " << field_group_id << ": " + << field_id_string(RDC_FI_GPU_MEMORY_USAGE) << ", " + << field_id_string(RDC_FI_POWER_USAGE) << std::endl; + + // Let the RDC to watch the fields and groups. The fields will be updated + // once per second, the max keep age is 1 minutes and only keep 10 samples. + result = rdc_field_watch(rdc_handle, group_id, field_group_id, 1000000, 60, 10); + if (result != RDC_ST_OK) { + std::cout << "Error watch group fields. 
Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Start to watch group:" << group_id << ", field_group:" << field_group_id + << std::endl; + std::cout << "Sleep a few seconds before retreive the data ...\n"; + + // Since we are running the RDC_OPERATION_MODE_AUTO mode, the rdc_update_ + // all_fields() will be called periodically at background. If running as + // RDC_OPERATION_MODE_MANUAL mode, we must call rdc_field_update_all() + // periodically to take samples. + usleep(5000000); // sleep 5 seconds before fetch the stats + + // Retreive the field and group information from RDC + rdc_group_info_t group_info; + rdc_field_group_info_t field_info; + result = rdc_group_gpu_get_info(rdc_handle, group_id, &group_info); + if (result != RDC_ST_OK) { + std::cout << "Error get gpu group info. Return: " << rdc_status_string(result); + goto cleanup; + } + result = rdc_group_field_get_info(rdc_handle, field_group_id, &field_info); + if (result != RDC_ST_OK) { + std::cout << "Error get field group info. Return: " << rdc_status_string(result); + goto cleanup; + } + + // Get the latest metrics + std::cout << "Get the latest metrics for group:" << group_id << " field_group:" << field_group_id + << std::endl; + std::cout << "time_stamp\t" + << "GPU_index\t" + << "field_name\t\t" + << "field_value\n"; + for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { + for (uint32_t findex = 0; findex < field_info.count; findex++) { + rdc_field_value value; + result = rdc_field_get_latest_value(rdc_handle, group_info.entity_ids[gindex], + field_info.field_ids[findex], &value); + if (result == RDC_ST_NOT_FOUND) { + continue; + } + if (result != RDC_ST_OK) { + std::cout << "Error get least value. 
Return: " << rdc_status_string(result); + goto cleanup; + } + // We only support the integer metrics so far + std::cout << value.ts << "\t" << group_info.entity_ids[gindex] << "\t\t" << std::left + << std::setw(16) << field_id_string(value.field_id) << "\t" << value.value.l_int + << std::endl; + } + } + + // Stop watching the field group + result = rdc_field_unwatch(rdc_handle, group_id, field_group_id); + if (result != RDC_ST_OK) { + std::cout << "Error stop watch fields. Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Stop watch group:" << group_id << ", field_group:" << field_group_id << std::endl; + + // Get the history data last 10 seconds + std::cout << "Get last 10 seconds metrics for group:" << group_id + << " field_group:" << field_group_id << std::endl; + std::cout << "time_stamp\t" + << "GPU_index\t" + << "field_name\t\t" + << "field_value\n"; + start_timestamp = static_cast(time(nullptr) - 10) * 1000; + for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { + for (uint32_t findex = 0; findex < field_info.count; findex++) { + since_timestamp = start_timestamp; + while (true) { + rdc_field_value value; + result = rdc_field_get_value_since(rdc_handle, group_info.entity_ids[gindex], + field_info.field_ids[findex], since_timestamp, + &next_timestamp, &value); + if (result == RDC_ST_NOT_FOUND) { + break; } - } else { // embedded - result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); if (result != RDC_ST_OK) { - std::cout << "Error starting embedded RDC engine. Return: " - << rdc_status_string(result) << std::endl; - goto cleanup; + std::cout << "Error get history data. 
Return: " << rdc_status_string(result); + goto cleanup; } - } + std::cout << value.ts << "\t" << group_info.entity_ids[gindex] << "\t\t" << std::left + << std::setw(16) << field_id_string(value.field_id) << "\t" << value.value.l_int + << std::endl; + since_timestamp = next_timestamp; + } // while + } // for findex + } // for gindex - // Now we can use the same API for both standalone and embedded - // Get the list of devices in the system - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; - result = rdc_device_get_all(rdc_handle, gpu_index_list, &count); - if (result != RDC_ST_OK) { - std::cout << "Error to find devices on the system. Return: " - << rdc_status_string(result); - goto cleanup; - } - if (count == 0) { - std::cout << "No GPUs find on the sytem "; - goto cleanup; - } else { - std::cout << count << " GPUs found in the system.\n"; - } + // Delete the field group and GPU group + result = rdc_group_field_destroy(rdc_handle, field_group_id); + if (result != RDC_ST_OK) { + std::cout << "Error delete field group. Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Deleted the field group " << field_group_id << std::endl; - // Create the group - rdc_gpu_group_t group_id; - result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, - group_name, &group_id); - if (result != RDC_ST_OK) { - std::cout << "Error creating group. Return: " - << rdc_status_string(result); - goto cleanup; - } - std::cout << "Created the GPU group " << group_id << std::endl; + result = rdc_group_gpu_destroy(rdc_handle, group_id); + if (result != RDC_ST_OK) { + std::cout << "Error delete GPU group. Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Deleted the GPU group " << group_id << std::endl; - // Add all GPUs to the group - for (uint32_t i = 0; i < count; i++) { - result = rdc_group_gpu_add(rdc_handle, - group_id, gpu_index_list[i]); // Add GPU 0 - if (result != RDC_ST_OK) { - std::cout << "Error adding group. 
Return: " - << rdc_status_string(result); - goto cleanup; - } - rdc_device_attributes_t attribute; - result = rdc_device_get_attributes(rdc_handle, - gpu_index_list[i], &attribute); - if (result != RDC_ST_OK) { - std::cout << "Error get GPU attribute. Return: " - << rdc_status_string(result); - goto cleanup; - } - std::cout << "Add GPU " <(time(nullptr)-10)*1000; - for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { - for (uint32_t findex = 0; findex < field_info.count; findex++) { - since_timestamp = start_timestamp; - while (true) { - rdc_field_value value; - result = rdc_field_get_value_since(rdc_handle, - group_info.entity_ids[gindex] , field_info.field_ids[findex], - since_timestamp, &next_timestamp, &value); - if (result == RDC_ST_NOT_FOUND) { - break; - } - if (result != RDC_ST_OK) { - std::cout << "Error get history data. Return: " - << rdc_status_string(result); - goto cleanup; - } - std::cout << value.ts <<"\t" << group_info.entity_ids[gindex] - << "\t\t" << std::left << std::setw(16) - << field_id_string(value.field_id) << "\t" - << value.value.l_int << std::endl; - since_timestamp = next_timestamp; - } // while - } // for findex - } // for gindex - - // Delete the field group and GPU group - result = rdc_group_field_destroy(rdc_handle, field_group_id); - if (result != RDC_ST_OK) { - std::cout << "Error delete field group. Return: " - << rdc_status_string(result); - goto cleanup; - } - std::cout << "Deleted the field group " << field_group_id << std::endl; - - result = rdc_group_gpu_destroy(rdc_handle, group_id); - if (result != RDC_ST_OK) { - std::cout << "Error delete GPU group. Return: " - << rdc_status_string(result); - goto cleanup; - } - std::cout << "Deleted the GPU group " << group_id << std::endl; - - - // Cleanup consists of shutting down RDC. 
- cleanup: - std::cout << "Cleaning up.\n"; - if (standalone) - rdc_disconnect(rdc_handle); - else - rdc_stop_embedded(rdc_handle); - rdc_shutdown(); - return result; +// Cleanup consists of shutting down RDC. +cleanup: + std::cout << "Cleaning up.\n"; + if (standalone) + rdc_disconnect(rdc_handle); + else + rdc_stop_embedded(rdc_handle); + rdc_shutdown(); + return result; } diff --git a/projects/rdc/example/job_stats_example.cc b/projects/rdc/example/job_stats_example.cc index 78d01791f5..4ccdcc9636 100644 --- a/projects/rdc/example/job_stats_example.cc +++ b/projects/rdc/example/job_stats_example.cc @@ -21,162 +21,150 @@ THE SOFTWARE. */ #include + #include + #include "rdc/rdc.h" -int main(int, char **) { - rdc_status_t result; - rdc_handle_t rdc_handle; - bool standalone = false; - char hostIpAddress[] = {"127.0.0.1:50051"}; - char group_name[] = {"group1"}; - char job_id[] = {"123"}; +int main(int, char**) { + rdc_status_t result; + rdc_handle_t rdc_handle; + bool standalone = false; + char hostIpAddress[] = {"127.0.0.1:50051"}; + char group_name[] = {"group1"}; + char job_id[] = {"123"}; + // Select the embedded mode and standalone mode dynamically. + std::cout << "Start rdci in: \n"; + std::cout << "0 - Embedded mode \n"; + std::cout << "1 - Standalone mode \n"; + while (!(std::cin >> standalone)) { + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); + } + std::cout << std::endl; + std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n"); - // Select the embedded mode and standalone mode dynamically. - std::cout << "Start rdci in: \n"; - std::cout << "0 - Embedded mode \n"; - std::cout << "1 - Standalone mode \n"; - while (!(std::cin >> standalone)) { - std::cout << "Invalid input.\n"; - std::cin.clear(); - std::cin.ignore(); - } - std::cout << std::endl; - std::cout << (standalone? 
- "Standalone mode selected.\n":"Embedded mode selected.\n"); + // Init the rdc + result = rdc_init(0); - // Init the rdc - result = rdc_init(0); + if (result != RDC_ST_OK) { + std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } else { + std::cout << "RDC Initialized.\n"; + } + if (standalone) { // standalone + result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); if (result != RDC_ST_OK) { - std::cout << "Error initializing RDC. Return: " << - rdc_status_string(result) << std::endl; - goto cleanup; - } else { - std::cout << "RDC Initialized.\n"; + std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; } - - if (standalone) { // standalone - result = rdc_connect(hostIpAddress, &rdc_handle, - nullptr, nullptr, nullptr); - if ( result != RDC_ST_OK ) { - std::cout << "Error connecting to remote rdcd. Return: " - << rdc_status_string(result) << std::endl; - goto cleanup; - } - } else { // embedded - result = rdc_start_embedded(RDC_OPERATION_MODE_MANUAL, &rdc_handle); - if (result != RDC_ST_OK) { - std::cout << "Error starting embedded RDC engine. Return: " - << rdc_status_string(result) << std::endl; - goto cleanup; - } - } - - // Now we can use the same API for both standalone and embedded - // (1) create group and add GPUs - rdc_gpu_group_t group_id; - result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, - group_name, &group_id); + } else { // embedded + result = rdc_start_embedded(RDC_OPERATION_MODE_MANUAL, &rdc_handle); if (result != RDC_ST_OK) { - std::cout << "Error creating group. Return: " - << rdc_status_string(result); + std::cout << "Error starting embedded RDC engine. 
Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + } + + // Now we can use the same API for both standalone and embedded + // (1) create group and add GPUs + rdc_gpu_group_t group_id; + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id); + if (result != RDC_ST_OK) { + std::cout << "Error creating group. Return: " << rdc_status_string(result); + goto cleanup; + } + + result = rdc_group_gpu_add(rdc_handle, group_id, 0); // Add GPU 0 + if (result != RDC_ST_OK) { + std::cout << "Error adding group. Return: " << rdc_status_string(result); + goto cleanup; + } + + // (2) start the recording. Set the sample frequency to once per second. + result = rdc_job_start_stats(rdc_handle, group_id, job_id, 1000000); + if (result != RDC_ST_OK) { + std::cout << "Error start job stats. Return: " << rdc_status_string(result); + goto cleanup; + } + + // For standalone mode, the daemon will update and cache the samples + // In manual mode, we must call rdc_field_update_all periodically to + // take samples. + if (!standalone) { // embedded manual mode + for (int i = 5; i > 0; i--) { // As an example, we will take 5 samples + result = rdc_field_update_all(rdc_handle, 0); + if (result != RDC_ST_OK) { + std::cout << "Error update all fields. Return: " << rdc_status_string(result); goto cleanup; + } + usleep(1000000); } + } else { // standalone mode, do nothing + usleep(5000000); // sleep 5 seconds before fetch the stats + } - result = rdc_group_gpu_add(rdc_handle, group_id, 0); // Add GPU 0 - if (result != RDC_ST_OK) { - std::cout << "Error adding group. Return: " - << rdc_status_string(result); - goto cleanup; - } + // (3) stop the Slurm job, which will stop the watch + // We do not have to stop the job to get stats. The rdc_job_get_stats can be + // called at any time before stop + result = rdc_job_stop_stats(rdc_handle, job_id); + if (result != RDC_ST_OK) { + std::cout << "Error stop job stats. 
Return: " << rdc_status_string(result); + goto cleanup; + } - // (2) start the recording. Set the sample frequency to once per second. - result = rdc_job_start_stats(rdc_handle, group_id, - job_id, 1000000); - if (result != RDC_ST_OK) { - std::cout << "Error start job stats. Return: " - << rdc_status_string(result); - goto cleanup; - } + // (4) Get the stats + rdc_job_info_t job_info; + result = rdc_job_get_stats(rdc_handle, job_id, &job_info); - // For standalone mode, the daemon will update and cache the samples - // In manual mode, we must call rdc_field_update_all periodically to - // take samples. - if (!standalone) { // embedded manual mode - for (int i=5; i > 0 ; i--) { // As an example, we will take 5 samples - result = rdc_field_update_all(rdc_handle, 0); - if (result != RDC_ST_OK) { - std::cout << "Error update all fields. Return: " - << rdc_status_string(result); - goto cleanup; - } - usleep(1000000); - } - } else { // standalone mode, do nothing - usleep(5000000); // sleep 5 seconds before fetch the stats - } + if (result == RDC_ST_OK) { + std::cout << "|------- Execution Stats ----------+" + << "------------------------------------\n"; + std::cout << "| Start Time * | " << job_info.summary.start_time << "\n"; + std::cout << "| End Time * | " << job_info.summary.end_time << "\n"; + std::cout << "| Total Execution Time (sec) * | " + << (job_info.summary.end_time - job_info.summary.start_time) << "\n"; + std::cout << "+------- Performance Stats --------+" + << "------------------------------------\n"; + std::cout << "| Energy Consumed (Joules) | " << job_info.summary.energy_consumed + << "\n"; + std::cout << "| Power Usage (Watts) | " + << "Max: " << job_info.summary.power_usage.max_value + << " Min: " << job_info.summary.power_usage.min_value + << " Avg: " << job_info.summary.power_usage.average << "\n"; + std::cout << "| GPU Clock (MHz) | " + << "Max: " << job_info.summary.gpu_clock.max_value + << " Min: " << job_info.summary.gpu_clock.min_value + << " 
Avg: " << job_info.summary.gpu_clock.average << "\n"; + std::cout << "| GPU Utilization (%) | " + << "Max: " << job_info.summary.gpu_utilization.max_value + << " Min: " << job_info.summary.gpu_utilization.min_value + << " Avg: " << job_info.summary.gpu_utilization.average << "\n"; + std::cout << "| Max GPU Memory Used (bytes) * | " << job_info.summary.max_gpu_memory_used + << "\n"; + std::cout << "| Memory Utilization (%) | " + << "Max: " << job_info.summary.memory_utilization.max_value + << " Min: " << job_info.summary.memory_utilization.min_value + << " Avg: " << job_info.summary.memory_utilization.average << "\n"; + std::cout << "+----------------------------------+" + << "------------------------------------\n"; + } else { + std::cout << "No data for job stats found." << std::endl; + } - // (3) stop the Slurm job, which will stop the watch - // We do not have to stop the job to get stats. The rdc_job_get_stats can be - // called at any time before stop - result = rdc_job_stop_stats(rdc_handle, job_id); - if (result != RDC_ST_OK) { - std::cout << "Error stop job stats. 
Return: " - << rdc_status_string(result); - goto cleanup; - } - - // (4) Get the stats - rdc_job_info_t job_info; - result = rdc_job_get_stats(rdc_handle, job_id, &job_info); - - if (result == RDC_ST_OK) { - std::cout << "|------- Execution Stats ----------+" - <<"------------------------------------\n"; - std::cout << "| Start Time * | " - << job_info.summary.start_time<< "\n"; - std::cout << "| End Time * | " - << job_info.summary.end_time <<"\n"; - std::cout << "| Total Execution Time (sec) * | " - << (job_info.summary.end_time-job_info.summary.start_time) - << "\n"; - std::cout << "+------- Performance Stats --------+" - << "------------------------------------\n"; - std::cout << "| Energy Consumed (Joules) | " - << job_info.summary.energy_consumed << "\n"; - std::cout << "| Power Usage (Watts) | " - << "Max: " << job_info.summary.power_usage.max_value - << " Min: "<< job_info.summary.power_usage.min_value - << " Avg: "<< job_info.summary.power_usage.average << "\n"; - std::cout << "| GPU Clock (MHz) | " - << "Max: " < -#include -#include -#include #include -#include "rdc_lib/rdc_common.h" +#include +#include +#include +#include + #include "rdc/rdc.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { class RdcCacheManager { public: - virtual rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, - rdc_field_t field, rdc_field_value* value) = 0; - virtual rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, - rdc_field_t field, uint64_t since_time_stamp, - uint64_t *next_since_time_stamp, rdc_field_value* value) = 0; - virtual rdc_status_t rdc_update_cache(uint32_t gpu_index, - const rdc_field_value& value) = 0; - virtual rdc_status_t evict_cache(uint32_t gpu_index, rdc_field_t field_id, - uint64_t max_keep_samples, double max_keep_age) = 0; - virtual std::string get_cache_stats() = 0; + virtual rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field, + rdc_field_value* value) = 0; + virtual rdc_status_t 
rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field, + uint64_t since_time_stamp, + uint64_t* next_since_time_stamp, + rdc_field_value* value) = 0; + virtual rdc_status_t rdc_update_cache(uint32_t gpu_index, const rdc_field_value& value) = 0; + virtual rdc_status_t evict_cache(uint32_t gpu_index, rdc_field_t field_id, + uint64_t max_keep_samples, double max_keep_age) = 0; + virtual std::string get_cache_stats() = 0; - virtual rdc_status_t rdc_job_get_stats(const char job_id[64], - const rdc_gpu_gauges_t& gpu_gauges, - rdc_job_info_t* p_job_info) = 0; - virtual rdc_status_t rdc_job_start_stats(const char job_id[64], - const rdc_group_info_t& group, - const rdc_field_group_info_t& finfo, - const rdc_gpu_gauges_t& gpu_gauges) = 0; - virtual rdc_status_t rdc_job_stop_stats(const char job_id[64], - const rdc_gpu_gauges_t& gpu_gauge) = 0; - virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index, - const std::string& job_id, const rdc_field_value& value) = 0; - virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0; - virtual rdc_status_t rdc_job_remove_all() = 0; + virtual rdc_status_t rdc_job_get_stats(const char job_id[64], const rdc_gpu_gauges_t& gpu_gauges, + rdc_job_info_t* p_job_info) = 0; + virtual rdc_status_t rdc_job_start_stats(const char job_id[64], const rdc_group_info_t& group, + const rdc_field_group_info_t& finfo, + const rdc_gpu_gauges_t& gpu_gauges) = 0; + virtual rdc_status_t rdc_job_stop_stats(const char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) = 0; + virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index, const std::string& job_id, + const rdc_field_value& value) = 0; + virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove_all() = 0; - virtual ~RdcCacheManager() {} + virtual ~RdcCacheManager() {} }; typedef std::shared_ptr RdcCacheManagerPtr; - } // namespace rdc } // namespace amd diff --git a/projects/rdc/include/rdc_lib/RdcDiagnostic.h 
b/projects/rdc/include/rdc_lib/RdcDiagnostic.h index 0f1fef4154..d915334280 100644 --- a/projects/rdc/include/rdc_lib/RdcDiagnostic.h +++ b/projects/rdc/include/rdc_lib/RdcDiagnostic.h @@ -23,6 +23,7 @@ THE SOFTWARE. #define INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_ #include + #include "rdc/rdc.h" #include "rdc_lib/RdcDiagnosticLibInterface.h" @@ -31,33 +32,27 @@ namespace rdc { class RdcDiagnostic { public: - // get support test cases - virtual rdc_status_t rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) = 0; + // get support test cases + virtual rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) = 0; - // Run a specific test case - virtual rdc_status_t rdc_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) = 0; + // Run a specific test case + virtual rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, rdc_diag_test_result_t* result) = 0; - // Run multiple test cases - virtual rdc_status_t rdc_diagnostic_run( - const rdc_group_info_t& gpus, - rdc_diag_level_t level, - rdc_diag_response_t* response) = 0; + // Run multiple test cases + virtual rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + rdc_diag_response_t* response) = 0; - virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0; - virtual rdc_status_t rdc_diag_destroy() = 0; + virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0; + virtual rdc_status_t rdc_diag_destroy() = 0; - virtual ~RdcDiagnostic() {} + virtual ~RdcDiagnostic() {} }; typedef std::shared_ptr RdcDiagnosticPtr; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_ diff --git a/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h 
b/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h index 85d9930ccc..a9739c658f 100644 --- a/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h +++ b/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h @@ -25,29 +25,23 @@ THE SOFTWARE. // The telemetry interface for libraries, for example, RAS. #include - extern "C" { // The library will implement below function // Which test cases are supported in the library -rdc_status_t rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count); +rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count); // Run a specific test case -rdc_status_t rdc_diag_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result); +rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result); rdc_status_t rdc_diag_init(uint64_t flags); rdc_status_t rdc_diag_destroy(); - } - -#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_ +#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_ diff --git a/projects/rdc/include/rdc_lib/RdcException.h b/projects/rdc/include/rdc_lib/RdcException.h index 9c6b320cfc..58f534c5c1 100644 --- a/projects/rdc/include/rdc_lib/RdcException.h +++ b/projects/rdc/include/rdc_lib/RdcException.h @@ -25,6 +25,7 @@ THE SOFTWARE. 
#include #include + #include "rdc/rdc.h" namespace amd { @@ -32,8 +33,8 @@ namespace rdc { class RdcException : public std::exception { public: - RdcException(rdc_status_t error, const std::string description) : - err_(error), desc_(description) {} + RdcException(rdc_status_t error, const std::string description) + : err_(error), desc_(description) {} rdc_status_t error_code() const noexcept { return err_; } const char* what() const noexcept override { return desc_.c_str(); } @@ -46,4 +47,3 @@ class RdcException : public std::exception { } // namespace amd #endif // INCLUDE_RDC_LIB_RDCEXCEPTION_H_ - diff --git a/projects/rdc/include/rdc_lib/RdcGroupSettings.h b/projects/rdc/include/rdc_lib/RdcGroupSettings.h index 4affe16b9e..e29c926226 100644 --- a/projects/rdc/include/rdc_lib/RdcGroupSettings.h +++ b/projects/rdc/include/rdc_lib/RdcGroupSettings.h @@ -23,39 +23,33 @@ THE SOFTWARE. #define INCLUDE_RDC_LIB_RDCGROUPSETTINGS_H_ #include -#include "rdc_lib/rdc_common.h" + #include "rdc/rdc.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { class RdcGroupSettings { public: - virtual rdc_status_t rdc_group_gpu_create(const char* group_name, - rdc_gpu_group_t* p_rdc_group_id) = 0; - virtual rdc_status_t rdc_group_gpu_destroy( - rdc_gpu_group_t p_rdc_group_id) = 0; - virtual rdc_status_t rdc_group_gpu_add( - rdc_gpu_group_t groupId, uint32_t gpu_index) = 0; - virtual rdc_status_t rdc_group_gpu_get_info( - rdc_gpu_group_t p_rdc_group_id, - rdc_group_info_t* p_rdc_group_info) = 0; - virtual rdc_status_t rdc_group_get_all_ids( - rdc_gpu_group_t group_id_list[], uint32_t* count) = 0; + virtual rdc_status_t rdc_group_gpu_create(const char* group_name, + rdc_gpu_group_t* p_rdc_group_id) = 0; + virtual rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) = 0; + virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) = 0; + virtual rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* 
p_rdc_group_info) = 0; + virtual rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) = 0; + virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids, + const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) = 0; + virtual rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) = 0; + virtual rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) = 0; + virtual rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[], + uint32_t* count) = 0; - virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids, - rdc_field_t* field_ids, const char* field_group_name, - rdc_field_grp_t* rdc_field_group_id) = 0; - virtual rdc_status_t rdc_group_field_destroy( - rdc_field_grp_t rdc_field_group_id) = 0; - virtual rdc_status_t rdc_group_field_get_info( - rdc_field_grp_t rdc_field_group_id, - rdc_field_group_info_t* field_group_info) = 0; - virtual rdc_status_t rdc_group_field_get_all_ids( - rdc_field_grp_t field_group_id_list[], uint32_t* count) = 0; - - virtual ~RdcGroupSettings() {} + virtual ~RdcGroupSettings() {} }; typedef std::shared_ptr RdcGroupSettingsPtr; diff --git a/projects/rdc/include/rdc_lib/RdcHandler.h b/projects/rdc/include/rdc_lib/RdcHandler.h index 8cda0c0531..0e76740327 100644 --- a/projects/rdc/include/rdc_lib/RdcHandler.h +++ b/projects/rdc/include/rdc_lib/RdcHandler.h @@ -22,8 +22,8 @@ THE SOFTWARE. 
#ifndef INCLUDE_RDC_LIB_RDCHANDLER_H_ #define INCLUDE_RDC_LIB_RDCHANDLER_H_ -#include "rdc_lib/rdc_common.h" #include "rdc/rdc.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { @@ -31,70 +31,61 @@ namespace rdc { // Interface class class RdcHandler { public: - // Job API - virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - const char job_id[64], uint64_t update_freq) = 0; - virtual rdc_status_t rdc_job_get_stats(const char jobId[64], - rdc_job_info_t* p_job_info)= 0; - virtual rdc_status_t rdc_job_stop_stats(const char job_id[64]) = 0; - virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0; - virtual rdc_status_t rdc_job_remove_all() = 0; + // Job API + virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, const char job_id[64], + uint64_t update_freq) = 0; + virtual rdc_status_t rdc_job_get_stats(const char jobId[64], rdc_job_info_t* p_job_info) = 0; + virtual rdc_status_t rdc_job_stop_stats(const char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove_all() = 0; - // Discovery API - virtual rdc_status_t rdc_device_get_all( - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) = 0; - virtual rdc_status_t rdc_device_get_attributes(uint32_t gpu_index, - rdc_device_attributes_t* p_rdc_attr) = 0; + // Discovery API + virtual rdc_status_t rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], + uint32_t* count) = 0; + virtual rdc_status_t rdc_device_get_attributes(uint32_t gpu_index, + rdc_device_attributes_t* p_rdc_attr) = 0; - // Group API - virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, - const char* group_name, rdc_gpu_group_t* p_rdc_group_id) = 0; - virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, - uint32_t gpu_index) = 0; - virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids, - rdc_field_t* field_ids, const char* field_group_name, - rdc_field_grp_t* rdc_field_group_id) 
= 0; - virtual rdc_status_t rdc_group_field_get_info( - rdc_field_grp_t rdc_field_group_id, - rdc_field_group_info_t* field_group_info) = 0; - virtual rdc_status_t rdc_group_gpu_get_info( - rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) = 0; - virtual rdc_status_t rdc_group_get_all_ids( - rdc_gpu_group_t group_id_list[], uint32_t* count) = 0; - virtual rdc_status_t rdc_group_field_get_all_ids( - rdc_field_grp_t field_group_id_list[], uint32_t* count) = 0; - virtual rdc_status_t rdc_group_gpu_destroy( - rdc_gpu_group_t p_rdc_group_id) = 0; - virtual rdc_status_t rdc_group_field_destroy( - rdc_field_grp_t rdc_field_group_id) = 0; + // Group API + virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, const char* group_name, + rdc_gpu_group_t* p_rdc_group_id) = 0; + virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) = 0; + virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids, + const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) = 0; + virtual rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) = 0; + virtual rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) = 0; + virtual rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) = 0; + virtual rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[], + uint32_t* count) = 0; + virtual rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) = 0; + virtual rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) = 0; - // Field API - virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples) = 0; - virtual rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, - rdc_field_t field, 
rdc_field_value* value) = 0; - virtual rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, - rdc_field_t field, uint64_t since_time_stamp, - uint64_t *next_since_time_stamp, rdc_field_value* value) = 0; - virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id) = 0; + // Field API + virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) = 0; + virtual rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field, + rdc_field_value* value) = 0; + virtual rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field, + uint64_t since_time_stamp, + uint64_t* next_since_time_stamp, + rdc_field_value* value) = 0; + virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id) = 0; - // Diagnostic API - virtual rdc_status_t rdc_diagnostic_run( - rdc_gpu_group_t group_id, - rdc_diag_level_t level, - rdc_diag_response_t* response) = 0; + // Diagnostic API + virtual rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, + rdc_diag_response_t* response) = 0; - virtual rdc_status_t rdc_test_case_run( - rdc_gpu_group_t group_id, - rdc_diag_test_cases_t test_case, - rdc_diag_test_result_t* result) = 0; + virtual rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result) = 0; - // Control API - virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0; + // Control API + virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0; - virtual ~RdcHandler(){} + virtual ~RdcHandler() {} }; } // namespace rdc diff --git a/projects/rdc/include/rdc_lib/RdcLibraryLoader.h b/projects/rdc/include/rdc_lib/RdcLibraryLoader.h index dcc6be5bd3..a495d57de5 100644 --- a/projects/rdc/include/rdc_lib/RdcLibraryLoader.h +++ 
b/projects/rdc/include/rdc_lib/RdcLibraryLoader.h @@ -23,78 +23,75 @@ THE SOFTWARE. #define INCLUDE_RDC_LIB_RDCLIBRARYLOADER_H_ #include #include + #include -#include // NOLINT(build/c++11) +#include // NOLINT(build/c++11) + #include "rdc/rdc.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" - namespace amd { namespace rdc { class RdcLibraryLoader { public: - RdcLibraryLoader(); + RdcLibraryLoader(); - rdc_status_t load(const char* filename); + rdc_status_t load(const char* filename); - template rdc_status_t load_symbol(T* func_handler, - const char* func_name); + template + rdc_status_t load_symbol(T* func_handler, const char* func_name); - template rdc_status_t load(const char* filename, - T* func_make_handler); + template + rdc_status_t load(const char* filename, T* func_make_handler); - rdc_status_t unload(); + rdc_status_t unload(); - ~RdcLibraryLoader(); + ~RdcLibraryLoader(); private: - void* libHandler_; - std::mutex library_mutex_; + void* libHandler_; + std::mutex library_mutex_; }; -template rdc_status_t RdcLibraryLoader::load_symbol(T* func_handler, - const char* func_name) { - if (!libHandler_) { - RDC_LOG(RDC_ERROR, "Must load the library before load the symbol"); - return RDC_ST_FAIL_LOAD_MODULE; - } +template +rdc_status_t RdcLibraryLoader::load_symbol(T* func_handler, const char* func_name) { + if (!libHandler_) { + RDC_LOG(RDC_ERROR, "Must load the library before load the symbol"); + return RDC_ST_FAIL_LOAD_MODULE; + } - if (!func_handler || !func_name) { - return RDC_ST_FAIL_LOAD_MODULE; - } + if (!func_handler || !func_name) { + return RDC_ST_FAIL_LOAD_MODULE; + } - std::lock_guard guard(library_mutex_); + std::lock_guard guard(library_mutex_); - *reinterpret_cast(func_handler) = - dlsym(libHandler_, func_name); - if (*func_handler == nullptr) { - char* error = dlerror(); - RDC_LOG(RDC_ERROR, "RdcLibraryLoader: Fail to load the symbol " - << func_name << ": " << error); - return RDC_ST_FAIL_LOAD_MODULE; - } + 
*reinterpret_cast(func_handler) = dlsym(libHandler_, func_name); + if (*func_handler == nullptr) { + char* error = dlerror(); + RDC_LOG(RDC_ERROR, "RdcLibraryLoader: Fail to load the symbol " << func_name << ": " << error); + return RDC_ST_FAIL_LOAD_MODULE; + } - return RDC_ST_OK; + return RDC_ST_OK; } +template +rdc_status_t RdcLibraryLoader::load(const char* filename, T* func_make_handler) { + if (filename == nullptr || func_make_handler == nullptr) { + return RDC_ST_FAIL_LOAD_MODULE; + } -template rdc_status_t RdcLibraryLoader::load(const char* filename, - T* func_make_handler) { - if (filename == nullptr || func_make_handler == nullptr) { - return RDC_ST_FAIL_LOAD_MODULE; - } + rdc_status_t status = load(filename); + if (status != RDC_ST_OK) { + return status; + } - rdc_status_t status = load(filename); - if (status != RDC_ST_OK) { - return status; - } - - return load_symbol(func_make_handler, "make_handler"); + return load_symbol(func_make_handler, "make_handler"); } } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_RDCLIBRARYLOADER_H_ diff --git a/projects/rdc/include/rdc_lib/RdcLogger.h b/projects/rdc/include/rdc_lib/RdcLogger.h index ead06106be..4e4d74035c 100644 --- a/projects/rdc/include/rdc_lib/RdcLogger.h +++ b/projects/rdc/include/rdc_lib/RdcLogger.h @@ -21,39 +21,33 @@ THE SOFTWARE. 
*/ #ifndef INCLUDE_RDC_LIB_RDCLOGGER_H_ #define INCLUDE_RDC_LIB_RDCLOGGER_H_ +#include // NOLINT #include #include -#include // NOLINT namespace amd { namespace rdc { class RdcLogger { public: - explicit RdcLogger(std::ostream& os); + explicit RdcLogger(std::ostream& os); - static RdcLogger& getLogger() { - static RdcLogger logger(std::cout); - return logger; - } + static RdcLogger& getLogger() { + static RdcLogger logger(std::cout); + return logger; + } - bool should_log(uint32_t severity) { - return log_level_ >= severity; - } + bool should_log(uint32_t severity) { return log_level_ >= severity; } - std::ostream& get_ostream() { - return os_; - } + std::ostream& get_ostream() { return os_; } - std::string get_log_header(uint32_t severity, - const char* file, int line); + std::string get_log_header(uint32_t severity, const char* file, int line); private: - std::ostream& os_; - uint32_t log_level_; + std::ostream& os_; + uint32_t log_level_; }; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_RDCLOGGER_H_ diff --git a/projects/rdc/include/rdc_lib/RdcMetricFetcher.h b/projects/rdc/include/rdc_lib/RdcMetricFetcher.h index 829e93d749..6305e529fb 100644 --- a/projects/rdc/include/rdc_lib/RdcMetricFetcher.h +++ b/projects/rdc/include/rdc_lib/RdcMetricFetcher.h @@ -24,26 +24,26 @@ THE SOFTWARE. 
#include #include -#include "rdc_lib/rdc_common.h" -#include "rdc_lib/RdcTelemetryLibInterface.h" -#include "rdc/rdc.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcTelemetryLibInterface.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -class RdcMetricFetcher { +class RdcMetricFetcher { public: - virtual rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) = 0; - virtual rdc_status_t delete_rsmi_handle(RdcFieldKey fk) = 0; + virtual rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) = 0; + virtual rdc_status_t delete_rsmi_handle(RdcFieldKey fk) = 0; - virtual rdc_status_t fetch_smi_field(uint32_t gpu_index, - rdc_field_t field_id, rdc_field_value* value) = 0; + virtual rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, + rdc_field_value* value) = 0; - virtual rdc_status_t bulk_fetch_smi_fields( - rdc_gpu_field_t* fields, uint32_t fields_count, - std::vector& results) = 0; // NOLINT - virtual ~RdcMetricFetcher() {} + virtual rdc_status_t bulk_fetch_smi_fields( + rdc_gpu_field_t* fields, uint32_t fields_count, + std::vector& results) = 0; // NOLINT + virtual ~RdcMetricFetcher() {} }; typedef std::shared_ptr RdcMetricFetcherPtr; diff --git a/projects/rdc/include/rdc_lib/RdcMetricsUpdater.h b/projects/rdc/include/rdc_lib/RdcMetricsUpdater.h index 681855aff6..36ba9374c1 100644 --- a/projects/rdc/include/rdc_lib/RdcMetricsUpdater.h +++ b/projects/rdc/include/rdc_lib/RdcMetricsUpdater.h @@ -29,8 +29,8 @@ namespace rdc { class RdcMetricsUpdater { public: - virtual void start() = 0; - virtual void stop() = 0; + virtual void start() = 0; + virtual void stop() = 0; }; typedef std::shared_ptr RdcMetricsUpdaterPtr; @@ -38,5 +38,4 @@ typedef std::shared_ptr RdcMetricsUpdaterPtr; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_RDCMETRICSUPDATER_H_ diff --git a/projects/rdc/include/rdc_lib/RdcModuleMgr.h b/projects/rdc/include/rdc_lib/RdcModuleMgr.h index cfb577cf1f..2caf0d3e59 100644 --- 
a/projects/rdc/include/rdc_lib/RdcModuleMgr.h +++ b/projects/rdc/include/rdc_lib/RdcModuleMgr.h @@ -23,18 +23,19 @@ THE SOFTWARE. #define INCLUDE_RDC_LIB_RDCMODULEMGR_H_ #include -#include "rdc_lib/rdc_common.h" + #include "rdc/rdc.h" -#include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/RdcDiagnostic.h" +#include "rdc_lib/RdcTelemetry.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { class RdcModuleMgr { public: - virtual RdcTelemetryPtr get_telemetry_module() = 0; - virtual RdcDiagnosticPtr get_diagnostic_module() = 0; + virtual RdcTelemetryPtr get_telemetry_module() = 0; + virtual RdcDiagnosticPtr get_diagnostic_module() = 0; }; typedef std::shared_ptr RdcModuleMgrPtr; @@ -42,5 +43,4 @@ typedef std::shared_ptr RdcModuleMgrPtr; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_RDCMODULEMGR_H_ diff --git a/projects/rdc/include/rdc_lib/RdcNotification.h b/projects/rdc/include/rdc_lib/RdcNotification.h index 90f5dc912c..ae8bfb2be1 100644 --- a/projects/rdc/include/rdc_lib/RdcNotification.h +++ b/projects/rdc/include/rdc_lib/RdcNotification.h @@ -24,8 +24,9 @@ THE SOFTWARE. 
#include #include -#include "rdc_lib/rdc_common.h" + #include "rdc/rdc.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { @@ -33,24 +34,22 @@ namespace rdc { extern const uint32_t kMaxRSMIEvents; typedef struct { - uint32_t gpu_id; - rdc_field_value field; + uint32_t gpu_id; + rdc_field_value field; } rdc_evnt_notification_t; class RdcNotification { public: - virtual bool is_notification_event(rdc_field_t field) const = 0; + virtual bool is_notification_event(rdc_field_t field) const = 0; - virtual rdc_status_t - set_listen_events(const std::vector fk_arr) = 0; + virtual rdc_status_t set_listen_events(const std::vector fk_arr) = 0; - // Blocking - virtual rdc_status_t - listen(rdc_evnt_notification_t *events, uint32_t *num_events, - uint32_t timeout_ms) = 0; + // Blocking + virtual rdc_status_t listen(rdc_evnt_notification_t* events, uint32_t* num_events, + uint32_t timeout_ms) = 0; - virtual rdc_status_t stop_listening(uint32_t gpu_id) = 0; - virtual ~RdcNotification() {} + virtual rdc_status_t stop_listening(uint32_t gpu_id) = 0; + virtual ~RdcNotification() {} }; typedef std::shared_ptr RdcNotificationPtr; @@ -59,4 +58,3 @@ typedef std::shared_ptr RdcNotificationPtr; } // namespace amd #endif // INCLUDE_RDC_LIB_RDCNOTIFICATION_H_ - diff --git a/projects/rdc/include/rdc_lib/RdcPerfTimer.h b/projects/rdc/include/rdc_lib/RdcPerfTimer.h old mode 100755 new mode 100644 index fe4b86fbc6..c96f0b1a9d --- a/projects/rdc/include/rdc_lib/RdcPerfTimer.h +++ b/projects/rdc/include/rdc_lib/RdcPerfTimer.h @@ -24,9 +24,10 @@ THE SOFTWARE. #define INCLUDE_RDC_LIB_RDCRdcPerfTimer_H_ #include + #include -#include #include +#include /// \file /// Timer related class. 
@@ -37,9 +38,9 @@ class RdcPerfTimer { private: struct Timer { std::string name; /* < name name of time object*/ - uint64_t _freq; /* < _freq frequency*/ + uint64_t _freq; /* < _freq frequency*/ uint64_t _clocks; /* < _clocks number of ticks at end*/ - uint64_t _start; /* < _start start point ticks*/ + uint64_t _start; /* < _start start point ticks*/ }; std::vector _timers; /*< _timers vector to Timer objects */ @@ -80,9 +81,7 @@ class RdcPerfTimer { uint64_t MeasureTSCFreqHz(); }; - } // namespace rdc } // namespace amd #endif // INCLUDE_RDC_LIB_RDCRdcPerfTimer_H_ - diff --git a/projects/rdc/include/rdc_lib/RdcTelemetry.h b/projects/rdc/include/rdc_lib/RdcTelemetry.h index 16717beb36..615f2f8b6c 100644 --- a/projects/rdc/include/rdc_lib/RdcTelemetry.h +++ b/projects/rdc/include/rdc_lib/RdcTelemetry.h @@ -23,6 +23,7 @@ THE SOFTWARE. #define INCLUDE_RDC_LIB_RDCTELEMETRY_H_ #include + #include "rdc/rdc.h" #include "rdc_lib/RdcTelemetryLibInterface.h" @@ -31,27 +32,26 @@ namespace rdc { class RdcTelemetry { public: - // get support field ids - virtual rdc_status_t rdc_telemetry_fields_query( - uint32_t field_ids[MAX_NUM_FIELDS], - uint32_t* field_count) = 0; + // get support field ids + virtual rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count) = 0; - // Fetch - virtual rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, - uint32_t fields_count, rdc_field_value_f callback, - void* user_data) = 0; + // Fetch + virtual rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, + uint32_t fields_count, + rdc_field_value_f callback, + void* user_data) = 0; - virtual rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, - uint32_t fields_count) = 0; - virtual rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, - uint32_t fields_count) = 0; + virtual rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, + uint32_t fields_count) = 0; + virtual rdc_status_t 
rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, + uint32_t fields_count) = 0; - virtual ~RdcTelemetry() {} + virtual ~RdcTelemetry() {} }; typedef std::shared_ptr RdcTelemetryPtr; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_RDCTELEMETRY_H_ diff --git a/projects/rdc/include/rdc_lib/RdcTelemetryLibInterface.h b/projects/rdc/include/rdc_lib/RdcTelemetryLibInterface.h index 939a8628fa..e50e8e63cd 100644 --- a/projects/rdc/include/rdc_lib/RdcTelemetryLibInterface.h +++ b/projects/rdc/include/rdc_lib/RdcTelemetryLibInterface.h @@ -24,45 +24,40 @@ THE SOFTWARE. // The telemetry interface for libraries, for example, RAS. #include + #include extern "C" { // Structure to keep both gup index and field value typedef struct { - uint32_t gpu_index; - rdc_field_value field_value; + uint32_t gpu_index; + rdc_field_value field_value; } rdc_gpu_field_value_t; typedef struct { - uint32_t gpu_index; - rdc_field_t field_id; + uint32_t gpu_index; + rdc_field_t field_id; } rdc_gpu_field_t; #define MAX_NUM_FIELDS 8192 -typedef rdc_status_t(*rdc_field_value_f)(rdc_gpu_field_value_t* values, - uint32_t num_values, void* user_data); +typedef rdc_status_t (*rdc_field_value_f)(rdc_gpu_field_value_t* values, uint32_t num_values, + void* user_data); // The library will implement below function -rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], - uint32_t* field_count); +rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count); -rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, - uint32_t fields_count, rdc_field_value_f callback, - void* user_data); +rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count, + rdc_field_value_f callback, void* user_data); +rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count); -rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, - uint32_t fields_count); - -rdc_status_t 
rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, - uint32_t fields_count); +rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count); rdc_status_t rdc_module_init(uint64_t flags); rdc_status_t rdc_module_destroy(); - } -#endif // INCLUDE_RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ +#endif // INCLUDE_RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ diff --git a/projects/rdc/include/rdc_lib/RdcWatchTable.h b/projects/rdc/include/rdc_lib/RdcWatchTable.h index 44abfcf374..e33962b8ab 100644 --- a/projects/rdc/include/rdc_lib/RdcWatchTable.h +++ b/projects/rdc/include/rdc_lib/RdcWatchTable.h @@ -24,33 +24,33 @@ THE SOFTWARE. #include #include -#include "rdc_lib/rdc_common.h" -#include "rdc/rdc.h" +#include "rdc/rdc.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { class RdcWatchTable { public: - virtual rdc_status_t rdc_field_update_all() = 0; - virtual rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) = 0; + virtual rdc_status_t rdc_field_update_all() = 0; + virtual rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) = 0; - virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - const char job_id[64], uint64_t update_freq, - const rdc_gpu_gauges_t& gpu_gauge) = 0; - virtual rdc_status_t rdc_job_stop_stats(const char job_id[64], - const rdc_gpu_gauges_t& gpu_gauge) = 0; - virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0; - virtual rdc_status_t rdc_job_remove_all() = 0; + virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64], + uint64_t update_freq, + const rdc_gpu_gauges_t& gpu_gauge) = 0; + virtual rdc_status_t rdc_job_stop_stats(const char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) = 0; + virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove_all() = 0; - virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, uint64_t update_freq, - double max_keep_age, uint32_t 
max_keep_samples) = 0; - virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id) = 0; + virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) = 0; + virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id) = 0; - virtual ~RdcWatchTable() {} + virtual ~RdcWatchTable() {} }; typedef std::shared_ptr RdcWatchTablePtr; diff --git a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h index a83d78ef5b..9d27ed6a2b 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h @@ -22,15 +22,15 @@ THE SOFTWARE. #ifndef INCLUDE_RDC_LIB_IMPL_RDCCACHEMANAGERIMPL_H_ #define INCLUDE_RDC_LIB_IMPL_RDCCACHEMANAGERIMPL_H_ +#include #include #include // NOLINT(build/c++11) #include #include -#include + +#include "rdc/rdc.h" #include "rdc_lib/RdcCacheManager.h" #include "rdc_lib/rdc_common.h" -#include "rdc/rdc.h" - namespace amd { namespace rdc { @@ -39,89 +39,84 @@ namespace rdc { // types and arrays (no pointers). If a pointer is added, make sure to update // any code that copies this structure. struct RdcCacheEntry { - uint64_t last_time; - rdc_field_type_t type; - rdc_field_value_data value; + uint64_t last_time; + rdc_field_type_t type; + rdc_field_value_data value; }; typedef std::map> RdcCacheSamples; struct FieldSummaryStats { - int64_t max_value; - int64_t min_value; - int64_t total_value; + int64_t max_value; + int64_t min_value; + int64_t total_value; - // Use Welford algorithm to calculate the standard deviations. 
- // https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods - // https://www.johndcook.com/blog/standard_deviation/ - double old_m; - double old_s; - double new_m; - double new_s; + // Use Welford algorithm to calculate the standard deviations. + // https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods + // https://www.johndcook.com/blog/standard_deviation/ + double old_m; + double old_s; + double new_m; + double new_s; - uint64_t last_time; - uint64_t count; + uint64_t last_time; + uint64_t count; }; struct GpuSummaryStats { - uint64_t energy_consumed; - uint64_t energy_last_time; - uint64_t ecc_correct_init; // Init counter when job starts - uint64_t ecc_uncorrect_init; // Init counter when job starts - std::map field_summaries; + uint64_t energy_consumed; + uint64_t energy_last_time; + uint64_t ecc_correct_init; // Init counter when job starts + uint64_t ecc_uncorrect_init; // Init counter when job starts + std::map field_summaries; }; // Per job entry struct RdcJobStatsCacheEntry { - uint64_t start_time; - uint64_t end_time; - std::map gpu_stats; + uint64_t start_time; + uint64_t end_time; + std::map gpu_stats; }; // typedef std::map RdcJobStatsCache; -class RdcCacheManagerImpl: public RdcCacheManager { +class RdcCacheManagerImpl : public RdcCacheManager { public: - rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, - rdc_field_t field, rdc_field_value* value) override; - rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, - rdc_field_t field, uint64_t since_time_stamp, - uint64_t *next_since_time_stamp, rdc_field_value* value) override; - rdc_status_t rdc_update_cache(uint32_t gpu_index, - const rdc_field_value& value) override; - rdc_status_t evict_cache(uint32_t gpu_index, rdc_field_t field_id, - uint64_t max_keep_samples, double max_keep_age) override; - std::string get_cache_stats() override; + rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field, + rdc_field_value* value) 
override; + rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field, + uint64_t since_time_stamp, uint64_t* next_since_time_stamp, + rdc_field_value* value) override; + rdc_status_t rdc_update_cache(uint32_t gpu_index, const rdc_field_value& value) override; + rdc_status_t evict_cache(uint32_t gpu_index, rdc_field_t field_id, uint64_t max_keep_samples, + double max_keep_age) override; + std::string get_cache_stats() override; - rdc_status_t rdc_job_get_stats(const char job_id[64], - const rdc_gpu_gauges_t& gpu_gauges, - rdc_job_info_t* p_job_info) override; - rdc_status_t rdc_job_start_stats(const char job_id[64], - const rdc_group_info_t& group, - const rdc_field_group_info_t& finfo, - const rdc_gpu_gauges_t& gpu_gauges) override; - rdc_status_t rdc_job_stop_stats(const char job_id[64], - const rdc_gpu_gauges_t& gpu_gauge) override; - rdc_status_t rdc_update_job_stats(uint32_t gpu_index, - const std::string& job_id, - const rdc_field_value& value) override; - rdc_status_t rdc_job_remove(const char job_id[64]) override; - rdc_status_t rdc_job_remove_all() override; + rdc_status_t rdc_job_get_stats(const char job_id[64], const rdc_gpu_gauges_t& gpu_gauges, + rdc_job_info_t* p_job_info) override; + rdc_status_t rdc_job_start_stats(const char job_id[64], const rdc_group_info_t& group, + const rdc_field_group_info_t& finfo, + const rdc_gpu_gauges_t& gpu_gauges) override; + rdc_status_t rdc_job_stop_stats(const char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) override; + rdc_status_t rdc_update_job_stats(uint32_t gpu_index, const std::string& job_id, + const rdc_field_value& value) override; + rdc_status_t rdc_job_remove(const char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; private: - void set_summary(const FieldSummaryStats & stats, - rdc_stats_summary_t& gpu, rdc_stats_summary_t& summary, // NOLINT - unsigned int adjuster); - void set_average_summary( - rdc_stats_summary_t& summary, uint32_t num_gpus); // NOLINT - 
RdcCacheSamples cache_samples_; - RdcJobStatsCache cache_jobs_; - std::mutex cache_mutex_; + void set_summary(const FieldSummaryStats& stats, rdc_stats_summary_t& gpu, + rdc_stats_summary_t& summary, // NOLINT + unsigned int adjuster); + void set_average_summary(rdc_stats_summary_t& summary, + uint32_t num_gpus); // NOLINT + RdcCacheSamples cache_samples_; + RdcJobStatsCache cache_jobs_; + std::mutex cache_mutex_; }; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_IMPL_RDCCACHEMANAGERIMPL_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h b/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h index 8b7e2a42c8..dafef2cf0d 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h +++ b/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h @@ -22,10 +22,11 @@ THE SOFTWARE. #ifndef INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_ #define INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_ -#include #include -#include +#include #include +#include + #include "rdc_lib/RdcDiagnostic.h" #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcTelemetryLibInterface.h" @@ -35,37 +36,30 @@ namespace rdc { class RdcDiagnosticModule : public RdcDiagnostic { public: - rdc_status_t rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) override; + rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; - // Run a specific test case - rdc_status_t rdc_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) override; + // Run a specific test case + rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result) override; - rdc_status_t rdc_diagnostic_run( - const rdc_group_info_t& gpus, - rdc_diag_level_t level, - rdc_diag_response_t* 
response) override; + rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + rdc_diag_response_t* response) override; - rdc_status_t rdc_diag_init(uint64_t flags) override; - rdc_status_t rdc_diag_destroy() override; + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; - explicit RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher); + explicit RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher); private: - //< Helper function to dispatch fields to module - void get_fields_for_module( - rdc_gpu_field_t* fields, - uint32_t fields_count, - std::map> - & fields_in_module, - std::vector& unsupport_fields); // NOLINT - std::list diagnostic_modules_; - std::map testcases_to_module_; + //< Helper function to dispatch fields to module + void get_fields_for_module( + rdc_gpu_field_t* fields, uint32_t fields_count, + std::map>& fields_in_module, + std::vector& unsupport_fields); // NOLINT + std::list diagnostic_modules_; + std::map testcases_to_module_; }; typedef std::shared_ptr RdcDiagnosticModulePtr; @@ -73,5 +67,4 @@ typedef std::shared_ptr RdcDiagnosticModulePtr; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h index 5f1b9662af..edbb6c1bc4 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -23,102 +23,91 @@ THE SOFTWARE. 
#define INCLUDE_RDC_LIB_IMPL_RDCEMBEDDEDHANDLER_H_ #include // NOLINT(build/c++11) -#include "rdc_lib/RdcHandler.h" -#include "rdc_lib/RdcGroupSettings.h" -#include "rdc_lib/RdcMetricFetcher.h" + #include "rdc_lib/RdcCacheManager.h" +#include "rdc_lib/RdcGroupSettings.h" +#include "rdc_lib/RdcHandler.h" +#include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcMetricsUpdater.h" -#include "rdc_lib/RdcWatchTable.h" #include "rdc_lib/RdcModuleMgr.h" #include "rdc_lib/RdcNotification.h" +#include "rdc_lib/RdcWatchTable.h" namespace amd { namespace rdc { -class RdcEmbeddedHandler: public RdcHandler { +class RdcEmbeddedHandler : public RdcHandler { public: - // Job API - rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - const char job_id[64], uint64_t update_freq) override; - rdc_status_t rdc_job_get_stats(const char jobId[64], - rdc_job_info_t* p_job_info) override; - rdc_status_t rdc_job_stop_stats(const char job_id[64]) override; - rdc_status_t rdc_job_remove(const char job_id[64]) override; - rdc_status_t rdc_job_remove_all() override; + // Job API + rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, const char job_id[64], + uint64_t update_freq) override; + rdc_status_t rdc_job_get_stats(const char jobId[64], rdc_job_info_t* p_job_info) override; + rdc_status_t rdc_job_stop_stats(const char job_id[64]) override; + rdc_status_t rdc_job_remove(const char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; - // Discovery API - rdc_status_t rdc_device_get_all( - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) override; - rdc_status_t rdc_device_get_attributes(uint32_t gpu_index, - rdc_device_attributes_t* p_rdc_attr) override; + // Discovery API + rdc_status_t rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], + uint32_t* count) override; + rdc_status_t rdc_device_get_attributes(uint32_t gpu_index, + rdc_device_attributes_t* p_rdc_attr) override; - // Group API - rdc_status_t 
rdc_group_gpu_create(rdc_group_type_t type, - const char* group_name, - rdc_gpu_group_t* p_rdc_group_id) override; - rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, - uint32_t gpu_index) override; - rdc_status_t rdc_group_field_create(uint32_t num_field_ids, - rdc_field_t* field_ids, const char* field_group_name, - rdc_field_grp_t* rdc_field_group_id) override; - rdc_status_t rdc_group_field_get_info( - rdc_field_grp_t rdc_field_group_id, - rdc_field_group_info_t* field_group_info) override; - rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, - rdc_group_info_t* p_rdc_group_info) override; - rdc_status_t rdc_group_get_all_ids( - rdc_gpu_group_t group_id_list[], uint32_t* count) override; - rdc_status_t rdc_group_field_get_all_ids( - rdc_field_grp_t field_group_id_list[], uint32_t* count) override; - rdc_status_t rdc_group_gpu_destroy( - rdc_gpu_group_t p_rdc_group_id) override; - rdc_status_t rdc_group_field_destroy( - rdc_field_grp_t rdc_field_group_id) override; + // Group API + rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, const char* group_name, + rdc_gpu_group_t* p_rdc_group_id) override; + rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) override; + rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids, + const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) override; + rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) override; + rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) override; + rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) override; + rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[], + uint32_t* count) override; + rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) override; + rdc_status_t rdc_group_field_destroy(rdc_field_grp_t 
rdc_field_group_id) override; - // Field API - rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples) override; - rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, - rdc_field_t field, rdc_field_value* value) override; - rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, - rdc_field_t field, uint64_t since_time_stamp, - uint64_t *next_since_time_stamp, rdc_field_value* value) override; - rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id) override; - // Diagnostic API - rdc_status_t rdc_diagnostic_run( - rdc_gpu_group_t group_id, - rdc_diag_level_t level, - rdc_diag_response_t* response) override; - rdc_status_t rdc_test_case_run( - rdc_gpu_group_t group_id, - rdc_diag_test_cases_t test_case, - rdc_diag_test_result_t* result) override; + // Field API + rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) override; + rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field, + rdc_field_value* value) override; + rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field, + uint64_t since_time_stamp, uint64_t* next_since_time_stamp, + rdc_field_value* value) override; + rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; + // Diagnostic API + rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, + rdc_diag_response_t* response) override; + rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result) override; - // Control API - rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; + // Control API + rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; - explicit 
RdcEmbeddedHandler(rdc_operation_mode_t op_mode); - ~RdcEmbeddedHandler(); + explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode); + ~RdcEmbeddedHandler(); private: - rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges); - RdcGroupSettingsPtr group_settings_; - RdcCacheManagerPtr cache_mgr_; - RdcMetricFetcherPtr metric_fetcher_; - RdcModuleMgrPtr rdc_module_mgr_; - RdcNotificationPtr rdc_notif_; - RdcWatchTablePtr watch_table_; - RdcMetricsUpdaterPtr metrics_updater_; - std::future updater_; + rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges); + RdcGroupSettingsPtr group_settings_; + RdcCacheManagerPtr cache_mgr_; + RdcMetricFetcherPtr metric_fetcher_; + RdcModuleMgrPtr rdc_module_mgr_; + RdcNotificationPtr rdc_notif_; + RdcWatchTablePtr watch_table_; + RdcMetricsUpdaterPtr metrics_updater_; + std::future updater_; }; } // namespace rdc } // namespace amd extern "C" { - amd::rdc::RdcHandler *make_handler(rdc_operation_mode_t op_mode); +amd::rdc::RdcHandler* make_handler(rdc_operation_mode_t op_mode); } #endif // INCLUDE_RDC_LIB_IMPL_RDCEMBEDDEDHANDLER_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h index c57fe715dc..268d8e0077 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h @@ -22,52 +22,47 @@ THE SOFTWARE. 
#ifndef INCLUDE_RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_ #define INCLUDE_RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_ -#include #include -#include // NOLINT +#include +#include // NOLINT #include + #include "rdc_lib/RdcGroupSettings.h" namespace amd { namespace rdc { -class RdcGroupSettingsImpl: public RdcGroupSettings { +class RdcGroupSettingsImpl : public RdcGroupSettings { public: - rdc_status_t rdc_group_gpu_create( - const char* group_name, rdc_gpu_group_t* p_rdc_group_id) override; - rdc_status_t rdc_group_gpu_destroy( - rdc_gpu_group_t p_rdc_group_id) override; - rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, - uint32_t gpu_index) override; - rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, - rdc_group_info_t* p_rdc_group_info) override; - rdc_status_t rdc_group_get_all_ids( - rdc_gpu_group_t group_id_list[], uint32_t* count) override; + rdc_status_t rdc_group_gpu_create(const char* group_name, + rdc_gpu_group_t* p_rdc_group_id) override; + rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) override; + rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) override; + rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) override; + rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) override; - rdc_status_t rdc_group_field_create(uint32_t num_field_ids, - rdc_field_t* field_ids, const char* field_group_name, - rdc_field_grp_t* rdc_field_group_id) override; - rdc_status_t rdc_group_field_destroy( - rdc_field_grp_t rdc_field_group_id) override; - rdc_status_t rdc_group_field_get_info( - rdc_field_grp_t rdc_field_group_id, - rdc_field_group_info_t* field_group_info) override; - rdc_status_t rdc_group_field_get_all_ids( - rdc_field_grp_t field_group_id_list[], uint32_t* count) override; + rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids, + const char* field_group_name, + rdc_field_grp_t* 
rdc_field_group_id) override; + rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) override; + rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) override; + rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[], + uint32_t* count) override; - RdcGroupSettingsImpl(); + RdcGroupSettingsImpl(); private: - std::map gpu_group_; - std::map field_group_; - uint32_t cur_group_id_ = 1; - uint32_t cur_field_group_id_ = 0; - std::mutex group_mutex_; - std::mutex field_group_mutex_; + std::map gpu_group_; + std::map field_group_; + uint32_t cur_group_id_ = 1; + uint32_t cur_field_group_id_ = 0; + std::mutex group_mutex_; + std::mutex field_group_mutex_; }; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h index e188f2bd65..a0ddbe44c8 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h @@ -22,12 +22,13 @@ THE SOFTWARE. #ifndef INCLUDE_RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_ #define INCLUDE_RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_ -#include // NOLINT(build/c++11) -#include // NOLINT(build/c++11) -#include #include // NOLINT(build/c++11) +#include // NOLINT(build/c++11) #include +#include +#include // NOLINT(build/c++11) #include + #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/rdc_common.h" #include "rocm_smi/rocm_smi.h" @@ -38,9 +39,9 @@ namespace rdc { //!< Some metrics, like PCIe throughput may take a second to retreive. The //!< MetricValue will cache those metrics for async retreive. 
struct MetricValue { - uint64_t cache_ttl; - uint64_t last_time; - rdc_field_value value; + uint64_t cache_ttl; + uint64_t last_time; + rdc_field_value value; }; // This union represents any RSMI handles require initialization and/or @@ -49,56 +50,54 @@ struct MetricValue { // underlying raw event, then only one FieldRSMIData should be created, // and it should be used by both events. struct FieldRSMIData { - union { - rsmi_event_handle_t evt_handle; - }; - union { - rsmi_counter_value_t counter_val; - }; - ~FieldRSMIData() {} - FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0}{} + union { + rsmi_event_handle_t evt_handle; + }; + union { + rsmi_counter_value_t counter_val; + }; + ~FieldRSMIData() {} + FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0} {} }; //!< The data structure to store the async fetch task class RdcMetricFetcherImpl; struct MetricTask { - RdcFieldKey field; - std::function task; + RdcFieldKey field; + std::function task; }; -class RdcMetricFetcherImpl: public RdcMetricFetcher { +class RdcMetricFetcherImpl : public RdcMetricFetcher { public: - rdc_status_t fetch_smi_field(uint32_t gpu_index, - rdc_field_t field_id, rdc_field_value* value) override; - rdc_status_t bulk_fetch_smi_fields( - rdc_gpu_field_t* fields, uint32_t fields_count, - std::vector& results) override; // NOLINT - RdcMetricFetcherImpl(); - ~RdcMetricFetcherImpl(); + rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, + rdc_field_value* value) override; + rdc_status_t bulk_fetch_smi_fields( + rdc_gpu_field_t* fields, uint32_t fields_count, + std::vector& results) override; // NOLINT + RdcMetricFetcherImpl(); + ~RdcMetricFetcherImpl(); - rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) override; - rdc_status_t delete_rsmi_handle(RdcFieldKey fk) override; + rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) override; + rdc_status_t delete_rsmi_handle(RdcFieldKey fk) override; private: - std::shared_ptr get_rsmi_data(RdcFieldKey key); + std::shared_ptr 
get_rsmi_data(RdcFieldKey key); - uint64_t now(); - void get_ecc_error(uint32_t gpu_index, - rdc_field_t field_id, rdc_field_value* value); + uint64_t now(); + void get_ecc_error(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value); - //!< return true if starting async_get - bool async_get_pcie_throughput(uint32_t gpu_index, - rdc_field_t field_id, rdc_field_value* value); - void get_pcie_throughput(const RdcFieldKey& key); + //!< return true if starting async_get + bool async_get_pcie_throughput(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value); + void get_pcie_throughput(const RdcFieldKey& key); - //!< Async metric retreive - std::map async_metrics_; - std::map> rsmi_data_; - std::queue updated_tasks_; - std::mutex task_mutex_; - std::future updater_; // keep the future of updater - std::condition_variable cv_; - std::atomic task_started_; + //!< Async metric retreive + std::map async_metrics_; + std::map> rsmi_data_; + std::queue updated_tasks_; + std::mutex task_mutex_; + std::future updater_; // keep the future of updater + std::condition_variable cv_; + std::atomic task_started_; }; rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi); diff --git a/projects/rdc/include/rdc_lib/impl/RdcMetricsUpdaterImpl.h b/projects/rdc/include/rdc_lib/impl/RdcMetricsUpdaterImpl.h index c37e1f6d5d..ffc54e9f35 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcMetricsUpdaterImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcMetricsUpdaterImpl.h @@ -24,24 +24,26 @@ THE SOFTWARE. 
#include // NOLINT(build/c++11) #include + #include "rdc_lib/RdcMetricsUpdater.h" #include "rdc_lib/RdcWatchTable.h" namespace amd { namespace rdc { -class RdcMetricsUpdaterImpl: public RdcMetricsUpdater { +class RdcMetricsUpdaterImpl : public RdcMetricsUpdater { public: - void start() override; - void stop() override; - explicit RdcMetricsUpdaterImpl(const RdcWatchTablePtr& watch_table, - const uint32_t check_frequency); + void start() override; + void stop() override; + explicit RdcMetricsUpdaterImpl(const RdcWatchTablePtr& watch_table, + const uint32_t check_frequency); + private: - RdcWatchTablePtr watch_table_; - std::atomic started_; - std::future updater_; // keep the future of updater - std::future notif_updater_; // keep the future of notif updater - const uint32_t _check_frequency; // Check frequency in milliseconds + RdcWatchTablePtr watch_table_; + std::atomic started_; + std::future updater_; // keep the future of updater + std::future notif_updater_; // keep the future of notif updater + const uint32_t _check_frequency; // Check frequency in milliseconds }; } // namespace rdc diff --git a/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h b/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h index 6cb0b71013..ed19f2e824 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h @@ -36,17 +36,17 @@ namespace rdc { class RdcModuleMgrImpl : public RdcModuleMgr { public: - RdcTelemetryPtr get_telemetry_module() override; - RdcDiagnosticPtr get_diagnostic_module() override; - explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher); + RdcTelemetryPtr get_telemetry_module() override; + RdcDiagnosticPtr get_diagnostic_module() override; + explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher); private: - // Function module - RdcTelemetryPtr rdc_telemetry_module_; - RdcDiagnosticPtr rdc_diagnostic_module_; + // Function module + RdcTelemetryPtr rdc_telemetry_module_; + 
RdcDiagnosticPtr rdc_diagnostic_module_; - // Domain module - RdcMetricFetcherPtr fetcher_; + // Domain module + RdcMetricFetcherPtr fetcher_; }; } // namespace rdc diff --git a/projects/rdc/include/rdc_lib/impl/RdcNotificationImpl.h b/projects/rdc/include/rdc_lib/impl/RdcNotificationImpl.h index abd2f0bf47..1934a50e0c 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcNotificationImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcNotificationImpl.h @@ -22,40 +22,36 @@ THE SOFTWARE. #ifndef INCLUDE_RDC_LIB_IMPL_RDCNOTIFICATIONIMPL_H_ #define INCLUDE_RDC_LIB_IMPL_RDCNOTIFICATIONIMPL_H_ -#include -#include #include +#include #include +#include -#include "rdc_lib/rdc_common.h" -#include "rdc_lib/RdcNotification.h" #include "rdc/rdc.h" - +#include "rdc_lib/RdcNotification.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { class RdcNotificationImpl : public RdcNotification { public: - RdcNotificationImpl(); - ~RdcNotificationImpl(); + RdcNotificationImpl(); + ~RdcNotificationImpl(); - bool is_notification_event(rdc_field_t field) const override; - rdc_status_t set_listen_events( - const std::vector fk_arr) override; - // Blocking - rdc_status_t listen(rdc_evnt_notification_t *events, - uint32_t *num_events, uint32_t timeout_ms) override; - rdc_status_t stop_listening(uint32_t gpu_id) override; + bool is_notification_event(rdc_field_t field) const override; + rdc_status_t set_listen_events(const std::vector fk_arr) override; + // Blocking + rdc_status_t listen(rdc_evnt_notification_t* events, uint32_t* num_events, + uint32_t timeout_ms) override; + rdc_status_t stop_listening(uint32_t gpu_id) override; private: - std::map gpu_evnt_notif_masks_; - std::mutex notif_mutex_; + std::map gpu_evnt_notif_masks_; + std::mutex notif_mutex_; }; - } // namespace rdc } // namespace amd #endif // INCLUDE_RDC_LIB_IMPL_RDCNOTIFICATIONIMPL_H_ - diff --git a/projects/rdc/include/rdc_lib/impl/RdcRasLib.h b/projects/rdc/include/rdc_lib/impl/RdcRasLib.h index 
1e55f50400..9d38ab91d1 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcRasLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcRasLib.h @@ -22,76 +22,66 @@ THE SOFTWARE. #ifndef INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_ #define INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_ -#include -#include -#include -#include #include +#include +#include +#include #include +#include + +#include "rdc_lib/RdcDiagnostic.h" #include "rdc_lib/RdcLibraryLoader.h" #include "rdc_lib/RdcTelemetry.h" -#include "rdc_lib/RdcDiagnostic.h" - namespace amd { namespace rdc { -class RdcRasLib: public RdcTelemetry, public RdcDiagnostic { +class RdcRasLib : public RdcTelemetry, public RdcDiagnostic { public: - // get support field ids - rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], - uint32_t* field_count) override; + // get support field ids + rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count) override; - // Fetch - rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, - uint32_t fields_count, rdc_field_value_f callback, - void* user_data) override; + // Fetch + rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count, + rdc_field_value_f callback, void* user_data) override; - rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, - uint32_t fields_count) override; + rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) override; - rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, - uint32_t fields_count) override; + rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, + uint32_t fields_count) override; - rdc_status_t rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) override; + rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; - // Run a specific test case - rdc_status_t 
rdc_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) override; + // Run a specific test case + rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result) override; - rdc_status_t rdc_diagnostic_run( - const rdc_group_info_t& gpus, - rdc_diag_level_t level, - rdc_diag_response_t* response) override; + rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + rdc_diag_response_t* response) override; - rdc_status_t rdc_diag_init(uint64_t flags) override; - rdc_status_t rdc_diag_destroy() override; + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; - explicit RdcRasLib(); + RdcRasLib(); - ~RdcRasLib(); + ~RdcRasLib(); private: - RdcLibraryLoader lib_loader_; - rdc_status_t (*fields_value_get_)(rdc_gpu_field_t*, - uint32_t, rdc_field_value_f, void*); - rdc_status_t (*fields_query_)(uint32_t[MAX_NUM_FIELDS], uint32_t*); + RdcLibraryLoader lib_loader_; + rdc_status_t (*fields_value_get_)(rdc_gpu_field_t*, uint32_t, rdc_field_value_f, void*); + rdc_status_t (*fields_query_)(uint32_t[MAX_NUM_FIELDS], uint32_t*); - rdc_status_t (*fields_watch_)(rdc_gpu_field_t*, uint32_t); - rdc_status_t (*fields_unwatch_)(rdc_gpu_field_t*, uint32_t); + rdc_status_t (*fields_watch_)(rdc_gpu_field_t*, uint32_t); + rdc_status_t (*fields_unwatch_)(rdc_gpu_field_t*, uint32_t); - rdc_status_t (*rdc_module_init_)(uint64_t); - rdc_status_t (*rdc_module_destroy_)(); + rdc_status_t (*rdc_module_init_)(uint64_t); + rdc_status_t (*rdc_module_destroy_)(); }; typedef std::shared_ptr RdcRasLibPtr; - } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcRocpLib.h b/projects/rdc/include/rdc_lib/impl/RdcRocpLib.h index f793c46069..7ca544f5f9 100644 --- 
a/projects/rdc/include/rdc_lib/impl/RdcRocpLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcRocpLib.h @@ -34,63 +34,48 @@ namespace rdc { class RdcRocpLib : public RdcTelemetry { public: - /* Telemetry */ + /* Telemetry */ - // get support field ids - rdc_status_t rdc_telemetry_fields_query( - uint32_t field_ids[MAX_NUM_FIELDS], - uint32_t* field_count) override; + // get support field ids + rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count) override; - // Fetch - rdc_status_t rdc_telemetry_fields_value_get( - rdc_gpu_field_t* fields, - uint32_t fields_count, - rdc_field_value_f callback, - void* user_data) override; + // Fetch + rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count, + rdc_field_value_f callback, void* user_data) override; - rdc_status_t rdc_telemetry_fields_watch( - rdc_gpu_field_t* fields, - uint32_t fields_count) override; + rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) override; - rdc_status_t rdc_telemetry_fields_unwatch( - rdc_gpu_field_t* fields, - uint32_t fields_count) override; + rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, + uint32_t fields_count) override; - explicit RdcRocpLib(const char* lib_name); + explicit RdcRocpLib(const char* lib_name); - ~RdcRocpLib(); + ~RdcRocpLib(); private: - RdcLibraryLoader lib_loader_; + RdcLibraryLoader lib_loader_; - rdc_status_t (*telemetry_fields_query_)( - uint32_t field_ids[MAX_NUM_FIELDS], - uint32_t* field_count); + rdc_status_t (*telemetry_fields_query_)(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count); - rdc_status_t (*telemetry_fields_value_get_)( - rdc_gpu_field_t* fields, - uint32_t fields_count, - rdc_field_value_f callback, - void* user_data); + rdc_status_t (*telemetry_fields_value_get_)(rdc_gpu_field_t* fields, uint32_t fields_count, + rdc_field_value_f callback, void* user_data); - rdc_status_t (*telemetry_fields_watch_)( - 
rdc_gpu_field_t* fields, - uint32_t fields_count); + rdc_status_t (*telemetry_fields_watch_)(rdc_gpu_field_t* fields, uint32_t fields_count); - rdc_status_t (*telemetry_fields_unwatch_)( - rdc_gpu_field_t* fields, - uint32_t fields_count); + rdc_status_t (*telemetry_fields_unwatch_)(rdc_gpu_field_t* fields, uint32_t fields_count); - /** - * @brief Extract current ROCM_PATH from library or the environment - */ - std::string get_rocm_path(); + /** + * @brief Extract current ROCM_PATH from library or the environment + */ + std::string get_rocm_path(); - /** - * @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by - * librocmtools - */ - rdc_status_t set_rocmtools_path(); + /** + * @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by + * librocmtools + */ + rdc_status_t set_rocmtools_path(); }; using RdcRocpLibPtr = std::shared_ptr; diff --git a/projects/rdc/include/rdc_lib/impl/RdcRocrLib.h b/projects/rdc/include/rdc_lib/impl/RdcRocrLib.h index 9b28905c4f..423cd31e3e 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcRocrLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcRocrLib.h @@ -22,48 +22,42 @@ THE SOFTWARE. 
#ifndef INCLUDE_RDC_LIB_IMPL_RDCROCRLIB_H_ #define INCLUDE_RDC_LIB_IMPL_RDCROCRLIB_H_ -#include #include -#include "rdc_lib/RdcLibraryLoader.h" +#include + #include "rdc_lib/RdcDiagnostic.h" +#include "rdc_lib/RdcLibraryLoader.h" namespace amd { namespace rdc { class RdcRocrLib : public RdcDiagnostic { public: - rdc_status_t rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) override; + rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; - // Run a specific test case - rdc_status_t rdc_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) override; + // Run a specific test case + rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result) override; - rdc_status_t rdc_diagnostic_run( - const rdc_group_info_t& gpus, - rdc_diag_level_t level, - rdc_diag_response_t* response) override; + rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + rdc_diag_response_t* response) override; - rdc_status_t rdc_diag_init(uint64_t flags) override; - rdc_status_t rdc_diag_destroy() override; + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; - explicit RdcRocrLib(); + RdcRocrLib(); - ~RdcRocrLib(); + ~RdcRocrLib(); private: - RdcLibraryLoader lib_loader_; - rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, - uint32_t[RDC_MAX_NUM_DEVICES], uint32_t, - rdc_diag_test_result_t*); - rdc_status_t (*diag_test_cases_query_)( - rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*); - rdc_status_t (*diag_init_)(uint64_t); - rdc_status_t (*diag_destroy_)(); + RdcLibraryLoader lib_loader_; + rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t, + 
rdc_diag_test_result_t*); + rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*); + rdc_status_t (*diag_init_)(uint64_t); + rdc_status_t (*diag_destroy_)(); }; typedef std::shared_ptr RdcRocrLibPtr; diff --git a/projects/rdc/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h b/projects/rdc/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h index cc275f43b3..cece59d0ff 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h @@ -23,8 +23,9 @@ THE SOFTWARE. #define INCLUDE_RDC_LIB_IMPL_RDCSMIDIAGNOSTICIMPL_H_ #include #include -#include "rdc_lib/rdc_common.h" + #include "rdc/rdc.h" +#include "rdc_lib/rdc_common.h" #include "rocm_smi/rocm_smi.h" namespace amd { @@ -32,35 +33,25 @@ namespace rdc { class RdcSmiDiagnosticImpl { public: - RdcSmiDiagnosticImpl(); + RdcSmiDiagnosticImpl(); - rdc_status_t check_rsmi_process_info( - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result); - rdc_status_t check_rsmi_topo_info( - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result); - rdc_status_t check_rsmi_param_info( - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result); + rdc_status_t check_rsmi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result); + rdc_status_t check_rsmi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result); + rdc_status_t check_rsmi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result); private: - rdc_diag_result_t check_temperature_level(uint32_t gpu_index - , rsmi_temperature_type_t type - , char msg[MAX_DIAG_MSG_LENGTH] - , char per_gpu_msg[MAX_DIAG_MSG_LENGTH]); - std::string get_temperature_string( - rsmi_temperature_type_t type) const; + rdc_diag_result_t 
check_temperature_level(uint32_t gpu_index, rsmi_temperature_type_t type, + char msg[MAX_DIAG_MSG_LENGTH], + char per_gpu_msg[MAX_DIAG_MSG_LENGTH]); + std::string get_temperature_string(rsmi_temperature_type_t type) const; - rdc_diag_result_t check_voltage_level(uint32_t gpu_index - , rsmi_voltage_type_t type - , char msg[MAX_DIAG_MSG_LENGTH] - , char per_gpu_msg[MAX_DIAG_MSG_LENGTH]); - std::string get_voltage_string( - rsmi_voltage_type_t type) const; + rdc_diag_result_t check_voltage_level(uint32_t gpu_index, rsmi_voltage_type_t type, + char msg[MAX_DIAG_MSG_LENGTH], + char per_gpu_msg[MAX_DIAG_MSG_LENGTH]); + std::string get_voltage_string(rsmi_voltage_type_t type) const; }; typedef std::shared_ptr RdcSmiDiagnosticPtr; diff --git a/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h b/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h index 6c92bde387..943ceda29e 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h @@ -22,11 +22,12 @@ THE SOFTWARE. 
#ifndef INCLUDE_RDC_LIB_IMPL_RDCSMILIB_H_ #define INCLUDE_RDC_LIB_IMPL_RDCSMILIB_H_ -#include #include +#include + +#include "rdc_lib/RdcDiagnostic.h" #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcTelemetry.h" -#include "rdc_lib/RdcDiagnostic.h" #include "rdc_lib/impl/RdcSmiDiagnosticImpl.h" namespace amd { @@ -34,45 +35,38 @@ namespace rdc { class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic { public: - // get support field ids - rdc_status_t rdc_telemetry_fields_query( - uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) override; + // get support field ids + rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count) override; - // Fetch - rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, - uint32_t fields_count, rdc_field_value_f callback, - void* user_data) override; + // Fetch + rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count, + rdc_field_value_f callback, void* user_data) override; - rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, - uint32_t fields_count) override; - rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, - uint32_t fields_count) override; + rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) override; + rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, + uint32_t fields_count) override; - rdc_status_t rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) override; + rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; - // Run a specific test case - rdc_status_t rdc_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) override; + // Run a specific test case + rdc_status_t 
rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result) override; - rdc_status_t rdc_diagnostic_run( - const rdc_group_info_t& gpus, - rdc_diag_level_t level, - rdc_diag_response_t* response) override; + rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + rdc_diag_response_t* response) override; - rdc_status_t rdc_diag_init(uint64_t flags) override; - rdc_status_t rdc_diag_destroy() override; + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; - explicit RdcSmiLib(const RdcMetricFetcherPtr& mf); + explicit RdcSmiLib(const RdcMetricFetcherPtr& mf); private: - RdcMetricFetcherPtr metric_fetcher_; - bool bulk_fetch_enabled_; - RdcSmiDiagnosticPtr smi_diag_; + RdcMetricFetcherPtr metric_fetcher_; + bool bulk_fetch_enabled_; + RdcSmiDiagnosticPtr smi_diag_; }; typedef std::shared_ptr RdcSmiLibPtr; diff --git a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h index 4cb4662bc8..065c989025 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -22,98 +22,85 @@ THE SOFTWARE. 
#ifndef INCLUDE_RDC_LIB_IMPL_RDCSTANDALONEHANDLER_H_ #define INCLUDE_RDC_LIB_IMPL_RDCSTANDALONEHANDLER_H_ #include + #include + #include "rdc.grpc.pb.h" // NOLINT #include "rdc_lib/RdcHandler.h" namespace amd { namespace rdc { -class RdcStandaloneHandler: public RdcHandler { +class RdcStandaloneHandler : public RdcHandler { public: - // Job RdcAPI - rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - const char job_id[64], uint64_t update_freq) override; - rdc_status_t rdc_job_get_stats(const char jobId[64], - rdc_job_info_t* p_job_info) override; - rdc_status_t rdc_job_stop_stats(const char job_id[64]) override; - rdc_status_t rdc_job_remove(const char job_id[64]) override; - rdc_status_t rdc_job_remove_all() override; + // Job RdcAPI + rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, const char job_id[64], + uint64_t update_freq) override; + rdc_status_t rdc_job_get_stats(const char jobId[64], rdc_job_info_t* p_job_info) override; + rdc_status_t rdc_job_stop_stats(const char job_id[64]) override; + rdc_status_t rdc_job_remove(const char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; - // Discovery RdcAPI - rdc_status_t rdc_device_get_all( - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) override; - rdc_status_t rdc_device_get_attributes(uint32_t gpu_index, - rdc_device_attributes_t* p_rdc_attr) override; + // Discovery RdcAPI + rdc_status_t rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], + uint32_t* count) override; + rdc_status_t rdc_device_get_attributes(uint32_t gpu_index, + rdc_device_attributes_t* p_rdc_attr) override; - // Group RdcAPI - rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, - const char* group_name, - rdc_gpu_group_t* p_rdc_group_id) override; - rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, - uint32_t gpu_index) override; - rdc_status_t rdc_group_field_create(uint32_t num_field_ids, - rdc_field_t* field_ids, const char* field_group_name, - rdc_field_grp_t* 
rdc_field_group_id) override; - rdc_status_t rdc_group_field_get_info( - rdc_field_grp_t rdc_field_group_id, - rdc_field_group_info_t* field_group_info) override; - rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, - rdc_group_info_t* p_rdc_group_info) override; - rdc_status_t rdc_group_get_all_ids( - rdc_gpu_group_t group_id_list[], uint32_t* count) override; - rdc_status_t rdc_group_field_get_all_ids( - rdc_field_grp_t field_group_id_list[], uint32_t* count) override; - rdc_status_t rdc_group_gpu_destroy( - rdc_gpu_group_t p_rdc_group_id) override; - rdc_status_t rdc_group_field_destroy( - rdc_field_grp_t rdc_field_group_id) override; + // Group RdcAPI + rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, const char* group_name, + rdc_gpu_group_t* p_rdc_group_id) override; + rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) override; + rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids, + const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) override; + rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) override; + rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) override; + rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) override; + rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[], + uint32_t* count) override; + rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) override; + rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) override; - // Field RdcAPI - rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples) override; - rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, - rdc_field_t field, rdc_field_value* value) override; - 
rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, - rdc_field_t field, uint64_t since_time_stamp, - uint64_t *next_since_time_stamp, rdc_field_value* value) override; - rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id) override; - // Diagnostic API - rdc_status_t rdc_diagnostic_run( - rdc_gpu_group_t group_id, - rdc_diag_level_t level, - rdc_diag_response_t* response) override; - rdc_status_t rdc_test_case_run( - rdc_gpu_group_t group_id, - rdc_diag_test_cases_t test_case, - rdc_diag_test_result_t* result) override; + // Field RdcAPI + rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) override; + rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field, + rdc_field_value* value) override; + rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field, + uint64_t since_time_stamp, uint64_t* next_since_time_stamp, + rdc_field_value* value) override; + rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; + // Diagnostic API + rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, + rdc_diag_response_t* response) override; + rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result) override; - // Control RdcAPI - rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; + // Control RdcAPI + rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; - explicit RdcStandaloneHandler(const char* ip_and_port, - const char* root_ca, const char* client_cert, const char* client_key); + explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca, + const char* client_cert, const char* client_key); private: - // Helper function to handle the error - rdc_status_t error_handle(::grpc::Status status, uint32_t 
rdc_status); + // Helper function to handle the error + rdc_status_t error_handle(::grpc::Status status, uint32_t rdc_status); - bool copy_gpu_usage_info( - const ::rdc::GpuUsageInfo& src, - rdc_gpu_usage_info_t* target); + bool copy_gpu_usage_info(const ::rdc::GpuUsageInfo& src, rdc_gpu_usage_info_t* target); - std::unique_ptr<::rdc::RdcAPI::Stub> stub_; + std::unique_ptr<::rdc::RdcAPI::Stub> stub_; }; - } // namespace rdc } // namespace amd extern "C" { - amd::rdc::RdcHandler *make_handler(const char* ip_port, - const char* root_ca, const char* client_cert, const char* client_key); +amd::rdc::RdcHandler* make_handler(const char* ip_port, const char* root_ca, + const char* client_cert, const char* client_key); } #endif // INCLUDE_RDC_LIB_IMPL_RDCSTANDALONEHANDLER_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h b/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h index c195011a5d..b0282b84bc 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h +++ b/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h @@ -22,45 +22,41 @@ THE SOFTWARE. 
#ifndef INCLUDE_RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_ #define INCLUDE_RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_ -#include #include -#include +#include #include +#include + +#include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/impl/RdcRasLib.h" #include "rdc_lib/impl/RdcSmiLib.h" -#include "rdc_lib/RdcMetricFetcher.h" namespace amd { namespace rdc { class RdcTelemetryModule : public RdcTelemetry { public: - rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, - uint32_t fields_count, rdc_field_value_f callback, - void* user_data); + rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count, + rdc_field_value_f callback, void* user_data); - rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], - uint32_t* field_count); + rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count); - rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, - uint32_t fields_count); + rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count); - rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, - uint32_t fields_count); + rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count); - RdcTelemetryModule(RdcMetricFetcherPtr fetcher); + explicit RdcTelemetryModule(RdcMetricFetcherPtr fetcher); private: - //< Helper function to dispatch fields to module - void get_fields_for_module( - rdc_gpu_field_t* fields, - uint32_t fields_count, - std::map> - & fields_in_module, - std::vector& unsupport_fields); // NOLINT - std::list telemetry_modules_; - std::map fields_id_module_; + //< Helper function to dispatch fields to module + void get_fields_for_module( + rdc_gpu_field_t* fields, uint32_t fields_count, + std::map>& fields_in_module, + std::vector& unsupport_fields); // NOLINT + std::list telemetry_modules_; + std::map fields_id_module_; }; typedef std::shared_ptr 
RdcTelemetryModulePtr; @@ -68,5 +64,4 @@ typedef std::shared_ptr RdcTelemetryModulePtr; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h index 297650195d..6635f683a3 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -22,19 +22,20 @@ THE SOFTWARE. #ifndef INCLUDE_RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_ #define INCLUDE_RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_ -#include +#include #include -#include -#include #include #include // NOLINT -#include -#include "rdc_lib/RdcWatchTable.h" -#include "rdc_lib/RdcGroupSettings.h" +#include +#include +#include + #include "rdc_lib/RdcCacheManager.h" +#include "rdc_lib/RdcGroupSettings.h" #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcModuleMgr.h" #include "rdc_lib/RdcNotification.h" +#include "rdc_lib/RdcWatchTable.h" #include "rocm_smi/rocm_smi.h" namespace amd { @@ -42,104 +43,95 @@ namespace rdc { //!< The settings for a field or a group of field in the watch table. 
struct FieldSettings { - uint64_t update_freq; - uint32_t max_keep_samples; - double max_keep_age; - bool is_watching; - uint64_t last_update_time; + uint64_t update_freq; + uint32_t max_keep_samples; + double max_keep_age; + bool is_watching; + uint64_t last_update_time; }; struct JobWatchTableEntry { - uint32_t group_id; - std::vector fields; //< store fields for faster query + uint32_t group_id; + std::vector fields; //< store fields for faster query }; - class RdcWatchTableImpl : public RdcWatchTable { public: - rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - const char job_id[64], uint64_t update_freq, - const rdc_gpu_gauges_t& gpu_gauge) override; - rdc_status_t rdc_job_stop_stats(const char job_id[64], - const rdc_gpu_gauges_t& gpu_gauge) override; - rdc_status_t rdc_job_remove(const char job_id[64]) override; - rdc_status_t rdc_job_remove_all() override; + rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64], + uint64_t update_freq, + const rdc_gpu_gauges_t& gpu_gauge) override; + rdc_status_t rdc_job_stop_stats(const char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) override; + rdc_status_t rdc_job_remove(const char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; - rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples) override; + rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) override; - //!< rdc_field_unwatch() will not remove the entry from watch_table. - //!< The unwatched entry is still kept until the max_keep_age of the entry - //!< is reached, which will be handled in the clean_up() function. - rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id) override; + //!< rdc_field_unwatch() will not remove the entry from watch_table. 
+ //!< The unwatched entry is still kept until the max_keep_age of the entry + //!< is reached, which will be handled in the clean_up() function. + rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; - //!< When the RDC is running as RDC_OPERATION_MODE_MANUAL, the user will - //!< call this function periodically. Instead of providing other APIs to - //!< cleanup the cache, this function will update and cleanup the cache. - //!< - //!< This function may be called very frequently, and the cache cleanup - //!< is expensive. Internally, this function will throttle the cleanup to - //!< once per second. - rdc_status_t rdc_field_update_all() override; - rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) override; + //!< When the RDC is running as RDC_OPERATION_MODE_MANUAL, the user will + //!< call this function periodically. Instead of providing other APIs to + //!< cleanup the cache, this function will update and cleanup the cache. + //!< + //!< This function may be called very frequently, and the cache cleanup + //!< is expensive. Internally, this function will throttle the cleanup to + //!< once per second. 
+ rdc_status_t rdc_field_update_all() override; + rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) override; - RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, - const RdcCacheManagerPtr& cache_mgr, - const RdcModuleMgrPtr& module_mgr, - const RdcNotificationPtr& notif); + RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr, + const RdcModuleMgrPtr& module_mgr, const RdcNotificationPtr& notif); private: - //!< Helper function to Update the fields_in_table when unwatch tables - rdc_status_t update_field_in_table_when_unwatch( - const RdcFieldGroupKey& entry); + //!< Helper function to Update the fields_in_table when unwatch tables + rdc_status_t update_field_in_table_when_unwatch(const RdcFieldGroupKey& entry); - //!< Helper function to clean up the watch table and cache - void clean_up(); + //!< Helper function to clean up the watch table and cache + void clean_up(); - //!< Helper function for debug information in watch table and cache - void debug_status(); + //!< Helper function for debug information in watch table and cache + void debug_status(); - //!< Helper function to get the fields using the group and the field group. - rdc_status_t get_fields_from_group(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, - std::vector & fields); // NOLINT + //!< Helper function to get the fields using the group and the field group. 
+ rdc_status_t get_fields_from_group(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, + std::vector& fields); // NOLINT - bool is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id, - std::string& job_id) const; // NOLINT + bool is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id, + std::string& job_id) const; // NOLINT - rdc_status_t rdc_notif_update_cache(rdc_evnt_notification_t *events, - uint32_t num_events); - //!< The function will be pass as the callback for bulk fetch - static rdc_status_t handle_fields(rdc_gpu_field_value_t* values, - uint32_t num_values, void* user_data); + rdc_status_t rdc_notif_update_cache(rdc_evnt_notification_t* events, uint32_t num_events); + //!< The function will be pass as the callback for bulk fetch + static rdc_status_t handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values, + void* user_data); - RdcGroupSettingsPtr group_settings_; - RdcCacheManagerPtr cache_mgr_; - RdcModuleMgrPtr rdc_module_mgr_; - RdcNotificationPtr notifications_; + RdcGroupSettingsPtr group_settings_; + RdcCacheManagerPtr cache_mgr_; + RdcModuleMgrPtr rdc_module_mgr_; + RdcNotificationPtr notifications_; - //!< The watch table to store the watch settings. - std::map watch_table_; + //!< The watch table to store the watch settings. + std::map watch_table_; - //!< pairs - std::map job_watch_table_; + //!< pairs + std::map job_watch_table_; + //!< The settings for each field can be deduced from watch_table. But every + //!< rdc_field_update_all() call needs to deduce them. To improve the + //!< performance, the fields_to_watch_ is used to track the field settings. + //!< Those settings will only be updated when watching or unwatching. + std::map fields_to_watch_; - //!< The settings for each field can be deduced from watch_table. But every - //!< rdc_field_update_all() call needs to deduce them. To improve the - //!< performance, the fields_to_watch_ is used to track the field settings. 
- //!< Those settings will only be updated when watching or unwatching. - std::map fields_to_watch_; - - //!< The last clean up time - std::atomic last_cleanup_time_; - std::mutex watch_mutex_; + //!< The last clean up time + std::atomic last_cleanup_time_; + std::mutex watch_mutex_; }; } // namespace rdc } // namespace amd - #endif // INCLUDE_RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RsmiUtils.h b/projects/rdc/include/rdc_lib/impl/RsmiUtils.h index a49bf63586..92044a107d 100644 --- a/projects/rdc/include/rdc_lib/impl/RsmiUtils.h +++ b/projects/rdc/include/rdc_lib/impl/RsmiUtils.h @@ -34,4 +34,3 @@ rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi); } // namespace amd #endif // INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_ - diff --git a/projects/rdc/include/rdc_lib/rdc_common.h b/projects/rdc/include/rdc_lib/rdc_common.h index b459741f1b..617b35544c 100644 --- a/projects/rdc/include/rdc_lib/rdc_common.h +++ b/projects/rdc/include/rdc_lib/rdc_common.h @@ -28,18 +28,18 @@ THE SOFTWARE. #include "rdc/rdc.h" -#define RDC_ERROR 0 -#define RDC_INFO 1 -#define RDC_DEBUG 2 +#define RDC_ERROR 0 +#define RDC_INFO 1 +#define RDC_DEBUG 2 -#define RDC_LOG(debug_level, msg) do { \ - auto& logger = amd::rdc::RdcLogger::getLogger(); \ - if (logger.should_log((debug_level))) { \ - logger.get_ostream() << \ - logger.get_log_header((debug_level), __FILE__, __LINE__) << \ - msg << std::endl; \ - } \ -} while (0) +#define RDC_LOG(debug_level, msg) \ + do { \ + auto& logger = amd::rdc::RdcLogger::getLogger(); \ + if (logger.should_log((debug_level))) { \ + logger.get_ostream() << logger.get_log_header((debug_level), __FILE__, __LINE__) << msg \ + << std::endl; \ + } \ + } while (0) // typedef std::pair RdcFieldKey; @@ -64,7 +64,6 @@ typedef std::map rdc_gpu_gauges_t; * * @retval Return a pointer to the destination string. 
*/ -char *strncpy_with_null(char *dest, const char *src, size_t n); - +char* strncpy_with_null(char* dest, const char* src, size_t n); #endif // INCLUDE_RDC_LIB_RDC_COMMON_H_ diff --git a/projects/rdc/include/rdc_modules/rdc_rocp/RdcRocpBase.h b/projects/rdc/include/rdc_modules/rdc_rocp/RdcRocpBase.h index 0d506da60c..6b1f47ac0f 100644 --- a/projects/rdc/include/rdc_modules/rdc_rocp/RdcRocpBase.h +++ b/projects/rdc/include/rdc_modules/rdc_rocp/RdcRocpBase.h @@ -23,6 +23,7 @@ THE SOFTWARE. #ifndef RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_ #define RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_ #include + #include #include #include @@ -30,6 +31,7 @@ THE SOFTWARE. #include #include #include + #include "rdc/rdc.h" namespace amd { @@ -62,73 +64,69 @@ static const std::unordered_map counter_map_k = { /// Common interface for RocP tests and samples class RdcRocpBase { - typedef std::pair pair_gpu_field_t; - typedef struct session_info_t { - rocmtools_session_id_t id{}; - std::chrono:: - time_point - start_time; - std::chrono:: - time_point - stop_time; - } session_info_t; + typedef std::pair pair_gpu_field_t; + typedef struct session_info_t { + rocmtools_session_id_t id{}; + std::chrono::time_point start_time; + std::chrono::time_point stop_time; + } session_info_t; public: - RdcRocpBase(); - RdcRocpBase(const RdcRocpBase&) = default; - RdcRocpBase(RdcRocpBase&&) = delete; - RdcRocpBase& operator=(const RdcRocpBase&) = delete; - RdcRocpBase& operator=(RdcRocpBase&&) = delete; - ~RdcRocpBase(); + RdcRocpBase(); + RdcRocpBase(const RdcRocpBase&) = default; + RdcRocpBase(RdcRocpBase&&) = delete; + RdcRocpBase& operator=(const RdcRocpBase&) = delete; + RdcRocpBase& operator=(RdcRocpBase&&) = delete; + ~RdcRocpBase(); - /** - * @brief Lookup ROCProfiler counter - * - * @param[in] field An existing field already added to sessions dictionary - * @param[out] value A pointer that will be populated with returned value - * - * @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed - * 
successfully. - */ - rdc_status_t rocp_lookup(pair_gpu_field_t gpu_field, double* value); + /** + * @brief Lookup ROCProfiler counter + * + * @param[in] field An existing field already added to sessions dictionary + * @param[out] value A pointer that will be populated with returned value + * + * @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed + * successfully. + */ + rdc_status_t rocp_lookup(pair_gpu_field_t gpu_field, double* value); - /** - * @brief Destroy ROCmTools session responsible for monitoring a given - * field - * - * @details While rocmtools supports multiple fields per ID - it has a - * limit to how many counters it can query internally. - * To avoid concerning ourselves with said limit, we limit each session to - * 1 field. - * In the future this can be optimized to allow for multiple fields per - * session. - * - * @param[in] field A field to start monitoring - * - * @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed - * successfully. - */ - rdc_status_t create_session(pair_gpu_field_t gpu_field); + /** + * @brief Destroy ROCmTools session responsible for monitoring a given + * field + * + * @details While rocmtools supports multiple fields per ID - it has a + * limit to how many counters it can query internally. + * To avoid concerning ourselves with said limit, we limit each session to + * 1 field. + * In the future this can be optimized to allow for multiple fields per + * session. + * + * @param[in] field A field to start monitoring + * + * @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed + * successfully. + */ + rdc_status_t create_session(pair_gpu_field_t gpu_field); - /** - * @brief Destroy ROCmTools session responsible for monitoring a given - * field - * - * @param[in] field A field to stop monitoring - * - * @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed - * successfully. 
- */ - rdc_status_t destroy_session(pair_gpu_field_t gpu_field); + /** + * @brief Destroy ROCmTools session responsible for monitoring a given + * field + * + * @param[in] field A field to stop monitoring + * + * @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed + * successfully. + */ + rdc_status_t destroy_session(pair_gpu_field_t gpu_field); protected: private: - std::map sessions; + std::map sessions; - /** - * @brief Convert from rocmtools status into RDC status - */ - rdc_status_t Rocp2RdcError(rocmtools_status_t rocm_status); + /** + * @brief Convert from rocmtools status into RDC status + */ + rdc_status_t Rocp2RdcError(rocmtools_status_t rocm_status); }; } // namespace rdc diff --git a/projects/rdc/include/rdc_modules/rdc_rocr/ComputeQueueTest.h b/projects/rdc/include/rdc_modules/rdc_rocr/ComputeQueueTest.h old mode 100755 new mode 100644 index ff7b65b254..ace66a5e35 --- a/projects/rdc/include/rdc_modules/rdc_rocr/ComputeQueueTest.h +++ b/projects/rdc/include/rdc_modules/rdc_rocr/ComputeQueueTest.h @@ -22,8 +22,8 @@ THE SOFTWARE. 
#ifndef RDC_MODULES_RDC_ROCR_COMPUTEQUEUETEST_H_ #define RDC_MODULES_RDC_ROCR_COMPUTEQUEUETEST_H_ -#include "rdc_modules/rdc_rocr/TestBase.h" #include "hsa/hsa.h" +#include "rdc_modules/rdc_rocr/TestBase.h" namespace amd { namespace rdc { @@ -65,11 +65,10 @@ typedef struct BinarySearch { // Other items we need to populate AQL packet uint64_t kernel_object; - uint32_t group_segment_size; ///< Kernel group seg size - uint32_t private_segment_size; ///< Kernel private seg size + uint32_t group_segment_size; ///< Kernel group seg size + uint32_t private_segment_size; ///< Kernel private seg size } BinarySearch; - class ComputeQueueTest : public TestBase { public: explicit ComputeQueueTest(uint32_t gpu_index); @@ -101,14 +100,12 @@ class ComputeQueueTest : public TestBase { hsa_status_t LoadKernelFromObjFile(BinarySearch* bs); hsa_status_t Run(BinarySearch* bs); hsa_status_t CleanUp(BinarySearch* bs); - void PopulateAQLPacket(BinarySearch const* bs, - hsa_kernel_dispatch_packet_t* aql); - hsa_status_t AgentMemcpy(void* dst, const void* src, - size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag); - hsa_status_t AllocAndSetKernArgs(BinarySearch* bs, void* args, - size_t arg_size, void** aql_buf_ptr); - void WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql, - hsa_queue_t* q); + void PopulateAQLPacket(BinarySearch const* bs, hsa_kernel_dispatch_packet_t* aql); + hsa_status_t AgentMemcpy(void* dst, const void* src, size_t size, hsa_agent_t dst_ag, + hsa_agent_t src_ag); + hsa_status_t AllocAndSetKernArgs(BinarySearch* bs, void* args, size_t arg_size, + void** aql_buf_ptr); + void WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql, hsa_queue_t* q); }; } // namespace rdc diff --git a/projects/rdc/include/rdc_modules/rdc_rocr/MemoryAccess.h b/projects/rdc/include/rdc_modules/rdc_rocr/MemoryAccess.h old mode 100755 new mode 100644 index 4d1b3b7267..bd5fe76c86 --- a/projects/rdc/include/rdc_modules/rdc_rocr/MemoryAccess.h +++ 
b/projects/rdc/include/rdc_modules/rdc_rocr/MemoryAccess.h @@ -23,9 +23,8 @@ THE SOFTWARE. #ifndef RDC_MODULES_RDC_ROCR_MEMORYACCESS_H_ #define RDC_MODULES_RDC_ROCR_MEMORYACCESS_H_ - -#include "rdc_modules/rdc_rocr/TestBase.h" #include "hsa/hsa.h" +#include "rdc_modules/rdc_rocr/TestBase.h" namespace amd { namespace rdc { @@ -51,18 +50,15 @@ class MemoryAccessTest : public TestBase { // @Brief: Display information about what this test does virtual void DisplayTestInfo(void); - // @Brief: This test verify that CPU is able to Read & write GPU memory void CPUAccessToGPUMemoryTest(void); // @Brief: This test verify that GPU is able to Read & write CPU memory void GPUAccessToCPUMemoryTest(void); - private: - void CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, - hsa_agent_t gpuAgent, - hsa_amd_memory_pool_t pool); + void CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent, + hsa_amd_memory_pool_t pool); void GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent); }; diff --git a/projects/rdc/include/rdc_modules/rdc_rocr/MemoryTest.h b/projects/rdc/include/rdc_modules/rdc_rocr/MemoryTest.h old mode 100755 new mode 100644 index 5813a8835e..37edee94fe --- a/projects/rdc/include/rdc_modules/rdc_rocr/MemoryTest.h +++ b/projects/rdc/include/rdc_modules/rdc_rocr/MemoryTest.h @@ -22,8 +22,8 @@ THE SOFTWARE. 
#ifndef RDC_MODULES_RDC_ROCR_MEMORYTEST_H_ #define RDC_MODULES_RDC_ROCR_MEMORYTEST_H_ -#include "rdc_modules/rdc_rocr/TestBase.h" #include "hsa/hsa.h" +#include "rdc_modules/rdc_rocr/TestBase.h" namespace amd { namespace rdc { @@ -54,8 +54,7 @@ class MemoryTest : public TestBase { hsa_status_t TestAllocate(hsa_amd_memory_pool_t pool, size_t sz); private: - hsa_status_t MaxSingleAllocationTest(hsa_agent_t ag, - hsa_amd_memory_pool_t pool); + hsa_status_t MaxSingleAllocationTest(hsa_agent_t ag, hsa_amd_memory_pool_t pool); }; } // namespace rdc diff --git a/projects/rdc/include/rdc_modules/rdc_rocr/RdcDiagnosticLib.h b/projects/rdc/include/rdc_modules/rdc_rocr/RdcDiagnosticLib.h index 652915a27b..f73fb6dc13 100644 --- a/projects/rdc/include/rdc_modules/rdc_rocr/RdcDiagnosticLib.h +++ b/projects/rdc/include/rdc_modules/rdc_rocr/RdcDiagnosticLib.h @@ -24,5 +24,4 @@ THE SOFTWARE. #include "rdc/rdc.h" #include "rdc_lib/RdcDiagnosticLibInterface.h" - #endif // RDC_MODULES_RDC_DIAGNOSTIC_RDCDIAGNOSTICLIB_H_ diff --git a/projects/rdc/include/rdc_modules/rdc_rocr/RdcRocrBase.h b/projects/rdc/include/rdc_modules/rdc_rocr/RdcRocrBase.h index 84cac0d8a1..773497da23 100644 --- a/projects/rdc/include/rdc_modules/rdc_rocr/RdcRocrBase.h +++ b/projects/rdc/include/rdc_modules/rdc_rocr/RdcRocrBase.h @@ -24,10 +24,12 @@ THE SOFTWARE. 
#define RDC_MODULES_RDC_ROCR_RDCROCRBASE_H_ #include #include + #include -#include "rdc_lib/RdcPerfTimer.h" + #include "hsa/hsa.h" #include "hsa/hsa_ext_amd.h" +#include "rdc_lib/RdcPerfTimer.h" namespace amd { namespace rdc { @@ -41,226 +43,134 @@ class RdcRocrBase { ///< Setters and Getters - void set_gpu_device1(hsa_agent_t in_dev) { - gpu_device1_.handle = in_dev.handle; - } - hsa_agent_t* gpu_device1(void) { - return &gpu_device1_; - } + void set_gpu_device1(hsa_agent_t in_dev) { gpu_device1_.handle = in_dev.handle; } + hsa_agent_t* gpu_device1(void) { return &gpu_device1_; } - void set_cpu_device(hsa_agent_t in_dev) { - cpu_device_.handle = in_dev.handle; - } - hsa_agent_t* cpu_device(void) { - return &cpu_device_; - } + void set_cpu_device(hsa_agent_t in_dev) { cpu_device_.handle = in_dev.handle; } + hsa_agent_t* cpu_device(void) { return &cpu_device_; } - void set_kernel_file_name(const char* in_file_name) { - kernel_file_name_ = in_file_name; - } - std::string const kernel_file_name(void) const { - return kernel_file_name_; - } + void set_kernel_file_name(const char* in_file_name) { kernel_file_name_ = in_file_name; } + std::string const kernel_file_name(void) const { return kernel_file_name_; } - void set_kernel_name(std::string in_kernel_name) { - kernel_name_ = in_kernel_name; - } - std::string const kernel_name(void) const { - return kernel_name_; - } + void set_kernel_name(std::string in_kernel_name) { kernel_name_ = in_kernel_name; } + std::string const kernel_name(void) const { return kernel_name_; } - void set_agent_name(std::string in_agent_name) { - agent_name_ = in_agent_name; - } + void set_agent_name(std::string in_agent_name) { agent_name_ = in_agent_name; } - std::string const get_agent_name(void) const { - return agent_name_; - } + std::string const get_agent_name(void) const { return agent_name_; } - void set_kernel_object(uint64_t in_kernel_object) { - kernel_object_ = in_kernel_object; - } - uint64_t kernel_object(void) const { - return 
kernel_object_; - } + void set_kernel_object(uint64_t in_kernel_object) { kernel_object_ = in_kernel_object; } + uint64_t kernel_object(void) const { return kernel_object_; } - void set_profile(hsa_profile_t in_prof) { - profile_ = in_prof; - } - hsa_profile_t profile(void) const { - return profile_; - } + void set_profile(hsa_profile_t in_prof) { profile_ = in_prof; } + hsa_profile_t profile(void) const { return profile_; } - uint32_t private_segment_size(void) const { - return private_segment_size_; - } - void set_private_segment_size(uint32_t sz) { - private_segment_size_ = sz; - } + uint32_t private_segment_size(void) const { return private_segment_size_; } + void set_private_segment_size(uint32_t sz) { private_segment_size_ = sz; } - void set_group_segment_size(uint32_t sz) { - group_segment_size_ = sz; - } - uint32_t group_segment_size(void) const { - return group_segment_size_; - } + void set_group_segment_size(uint32_t sz) { group_segment_size_ = sz; } + uint32_t group_segment_size(void) const { return group_segment_size_; } - void set_group_size(uint32_t sz) { - group_size_ = sz; - } - uint32_t group_size(void) const { - return group_size_; - } + void set_group_size(uint32_t sz) { group_size_ = sz; } + uint32_t group_size(void) const { return group_size_; } - void set_main_queue(hsa_queue_t* q) { - main_queue_ = q; - } - hsa_queue_t* main_queue(void) const { - return main_queue_; - } + void set_main_queue(hsa_queue_t* q) { main_queue_ = q; } + hsa_queue_t* main_queue(void) const { return main_queue_; } - hsa_kernel_dispatch_packet_t& aql(void) { - return aql_; - } + hsa_kernel_dispatch_packet_t& aql(void) { return aql_; } - void set_num_iteration(int num) { - num_iteration_ = num; - } - uint32_t num_iteration(void) const { - return num_iteration_; - } + void set_num_iteration(int num) { num_iteration_ = num; } + uint32_t num_iteration(void) const { return num_iteration_; } - hsa_amd_memory_pool_t& device_pool(void) { - return device_pool_; - } + 
hsa_amd_memory_pool_t& device_pool(void) { return device_pool_; } - hsa_amd_memory_pool_t& cpu_pool(void) { - return cpu_pool_; - } + hsa_amd_memory_pool_t& cpu_pool(void) { return cpu_pool_; } - hsa_amd_memory_pool_t& kern_arg_pool(void) { - return kern_arg_pool_; - } + hsa_amd_memory_pool_t& kern_arg_pool(void) { return kern_arg_pool_; } - void set_kernarg_size(uint32_t sz) { - kernarg_size_ = sz; - } - uint32_t kernarg_size(void) const { - return kernarg_size_; - } + void set_kernarg_size(uint32_t sz) { kernarg_size_ = sz; } + uint32_t kernarg_size(void) const { return kernarg_size_; } - void set_kernarg_align(uint32_t align) { - kernarg_align_ = align; - } - uint32_t kernarg_align(void) const { - return kernarg_align_; - } + void set_kernarg_align(uint32_t align) { kernarg_align_ = align; } + uint32_t kernarg_align(void) const { return kernarg_align_; } - void* kernarg_buffer(void) const { - return kernarg_buffer_; - } - void set_kernarg_buffer(void* buffer) { - kernarg_buffer_ = buffer; - } + void* kernarg_buffer(void) const { return kernarg_buffer_; } + void set_kernarg_buffer(void* buffer) { kernarg_buffer_ = buffer; } - int32_t requires_profile(void) const { - return requires_profile_; - } + int32_t requires_profile(void) const { return requires_profile_; } - char* orig_hsa_enable_interrupt() const { - return orig_hsa_enable_interrupt_; - } + char* orig_hsa_enable_interrupt() const { return orig_hsa_enable_interrupt_; } - bool enable_interrupt() const { - return enable_interrupt_; - } + bool enable_interrupt() const { return enable_interrupt_; } - void set_title(std::string name) { - title_ = name; - } - std::string title(void) const { - return title_; - } + void set_title(std::string name) { title_ = name; } + std::string title(void) const { return title_; } - RdcPerfTimer* hsa_timer(void) { - return &hsa_timer_; - } + RdcPerfTimer* hsa_timer(void) { return &hsa_timer_; } - void set_verbosity(uint32_t v) { - verbosity_ = v; - } - uint32_t verbosity(void) 
const { - return verbosity_; - } + void set_verbosity(uint32_t v) { verbosity_ = v; } + uint32_t verbosity(void) const { return verbosity_; } - void set_monitor_verbosity(uint32_t m) { - monitor_verbosity_ = m; - } - uint32_t monitor_verbosity(void) const { - return monitor_verbosity_; - } + void set_monitor_verbosity(uint32_t m) { monitor_verbosity_ = m; } + uint32_t monitor_verbosity(void) const { return monitor_verbosity_; } protected: - void set_requires_profile(int32_t reqd_prof) { - requires_profile_ = reqd_prof; - } + void set_requires_profile(int32_t reqd_prof) { requires_profile_ = reqd_prof; } - void set_enable_interrupt(bool doEnable) { - enable_interrupt_ = doEnable; - } + void set_enable_interrupt(bool doEnable) { enable_interrupt_ = doEnable; } private: - uint64_t num_iteration_; ///< Number of times to execute test + uint64_t num_iteration_; ///< Number of times to execute test - hsa_queue_t* main_queue_; ///< AQL queue used for packets + hsa_queue_t* main_queue_; ///< AQL queue used for packets - hsa_agent_t gpu_device1_; ///< Handle to first GPU found + hsa_agent_t gpu_device1_; ///< Handle to first GPU found - hsa_agent_t cpu_device_; ///< Handle to CPU + hsa_agent_t cpu_device_; ///< Handle to CPU - hsa_amd_memory_pool_t device_pool_; ///< Memory pool on gpu pool list + hsa_amd_memory_pool_t device_pool_; ///< Memory pool on gpu pool list - hsa_amd_memory_pool_t cpu_pool_; ///< Memory pool on cpu pool list + hsa_amd_memory_pool_t cpu_pool_; ///< Memory pool on cpu pool list - hsa_amd_memory_pool_t kern_arg_pool_; ///< Memory pool suitable for args + hsa_amd_memory_pool_t kern_arg_pool_; ///< Memory pool suitable for args - uint64_t kernel_object_; ///< Handle to kernel code + uint64_t kernel_object_; ///< Handle to kernel code - std::string kernel_file_name_; ///< Code object file name + std::string kernel_file_name_; ///< Code object file name - std::string kernel_name_; ///< Kernel name + std::string kernel_name_; ///< Kernel name - std::string 
agent_name_; ///< Agent name + std::string agent_name_; ///< Agent name - hsa_kernel_dispatch_packet_t aql_; ///< Kernel dispatch packet + hsa_kernel_dispatch_packet_t aql_; ///< Kernel dispatch packet - uint32_t group_segment_size_; ///< Kernel group seg size + uint32_t group_segment_size_; ///< Kernel group seg size - uint32_t kernarg_size_; ///< Kernarg memory size + uint32_t kernarg_size_; ///< Kernarg memory size - uint32_t kernarg_align_; ///< Alignment for kern argument memory + uint32_t kernarg_align_; ///< Alignment for kern argument memory - void* kernarg_buffer_; ///< Unaligned allocated kernel arg. buffer + void* kernarg_buffer_; ///< Unaligned allocated kernel arg. buffer - hsa_profile_t profile_; ///< Device profile. + hsa_profile_t profile_; ///< Device profile. - uint32_t group_size_; ///< Number of work items in one group + uint32_t group_size_; ///< Number of work items in one group - uint32_t private_segment_size_; ///< Kernel private seg size + uint32_t private_segment_size_; ///< Kernel private seg size - int32_t requires_profile_; ///< Profile required by test (-1 if no req.) + int32_t requires_profile_; ///< Profile required by test (-1 if no req.) - char* orig_hsa_enable_interrupt_; ///< Orig. value of HSA_ENABLE_INTERRUPT + char* orig_hsa_enable_interrupt_; ///< Orig. 
value of HSA_ENABLE_INTERRUPT - bool enable_interrupt_; ///< Whether to enable/disable interrupts for test + bool enable_interrupt_; ///< Whether to enable/disable interrupts for test - std::string title_; ///< Displayed title of test + std::string title_; ///< Displayed title of test - uint32_t verbosity_; ///< How much additional output to produce + uint32_t verbosity_; ///< How much additional output to produce - uint32_t monitor_verbosity_; ///< verbose or not + uint32_t monitor_verbosity_; ///< verbose or not - RdcPerfTimer hsa_timer_; ///< Timer to be used for timing parts of test + RdcPerfTimer hsa_timer_; ///< Timer to be used for timing parts of test }; } // namespace rdc diff --git a/projects/rdc/include/rdc_modules/rdc_rocr/TestBase.h b/projects/rdc/include/rdc_modules/rdc_rocr/TestBase.h old mode 100755 new mode 100644 index e0787828d0..96dc669a24 --- a/projects/rdc/include/rdc_modules/rdc_rocr/TestBase.h +++ b/projects/rdc/include/rdc_modules/rdc_rocr/TestBase.h @@ -22,9 +22,10 @@ THE SOFTWARE. #ifndef RDC_MODULES_RDC_ROCR_TESTBASE_H_ #define RDC_MODULES_RDC_ROCR_TESTBASE_H_ -#include #include +#include #include + #include "rdc_modules/rdc_rocr/RdcRocrBase.h" namespace amd { @@ -36,7 +37,7 @@ class TestBase : public RdcRocrBase { virtual ~TestBase(void); - enum VerboseLevel {VERBOSE_MIN = 0, VERBOSE_STANDARD, VERBOSE_PROGRESS}; + enum VerboseLevel { VERBOSE_MIN = 0, VERBOSE_STANDARD, VERBOSE_PROGRESS }; // @Brief: Before run the core measure codes, do something to set up // i.e. init runtime, prepare packet... 
@@ -54,12 +55,12 @@ class TestBase : public RdcRocrBase { // @Brief: Display information about the test virtual void DisplayTestInfo(void); - const std::string & description(void) const {return description_;} + const std::string& description(void) const { return description_; } void set_description(std::string d); - const std::string & get_gpu_info() const { return gpu_info_;} - const std::string & get_per_gpu_info() const { return per_gpu_info_;} + const std::string& get_gpu_info() const { return gpu_info_; } + const std::string& get_per_gpu_info() const { return per_gpu_info_; } hsa_status_t FindGPUIndex(hsa_agent_t agent, void* data); // Return the agent by GPU index in rocm_smi diff --git a/projects/rdc/include/rdc_modules/rdc_rocr/base_rocr_utils.h b/projects/rdc/include/rdc_modules/rdc_rocr/base_rocr_utils.h old mode 100755 new mode 100644 index bd3ffaa176..ed05b5f9a3 --- a/projects/rdc/include/rdc_modules/rdc_rocr/base_rocr_utils.h +++ b/projects/rdc/include/rdc_modules/rdc_rocr/base_rocr_utils.h @@ -26,10 +26,11 @@ THE SOFTWARE. /// \file /// Prototypes of utility functions that act on RdcRocrBase objects. -#include "rdc_modules/rdc_rocr/RdcRocrBase.h" #include -#include "rdc_modules/rdc_rocr/common.h" + #include "hsa/hsa.h" +#include "rdc_modules/rdc_rocr/RdcRocrBase.h" +#include "rdc_modules/rdc_rocr/common.h" namespace amd { namespace rdc { @@ -58,8 +59,7 @@ hsa_status_t SetDefaultAgents(RdcRocrBase* test); /// \param[in] do_profile [Optional] Specificy whether profiled queue should /// be created /// \returns HSA_STATUS_SUCCESS if no errors encountered -hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue, - uint32_t num_pkts = 0); +hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue, uint32_t num_pkts = 0); /// This function sets some reasonable default values for an AQL packet. /// Override any field as necessary after calling this function. 
@@ -68,18 +68,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue, /// \param[inout] aql Caller provided pointer to aql packet that will be /// populated /// \returns Appropriate hsa_status_t -hsa_status_t InitializeAQLPacket(const RdcRocrBase* test, - hsa_kernel_dispatch_packet_t* aql); +hsa_status_t InitializeAQLPacket(const RdcRocrBase* test, hsa_kernel_dispatch_packet_t* aql); /// This function writes all of the aql packet fields to the queue besides /// "setup" and "header". This assumes all the aql fields have be set /// appropriately. /// \param[in] test Test containing the queue and aql packet to be written. /// \returns Pointer to dispatch packet in queue that was written to -hsa_kernel_dispatch_packet_t* WriteAQLToQueue(RdcRocrBase* test, uint64_t *ind); +hsa_kernel_dispatch_packet_t* WriteAQLToQueue(RdcRocrBase* test, uint64_t* ind); -void WriteAQLToQueueLoc(hsa_queue_t *queue, uint64_t indx, - hsa_kernel_dispatch_packet_t *aql_pkt); +void WriteAQLToQueueLoc(hsa_queue_t* queue, uint64_t indx, hsa_kernel_dispatch_packet_t* aql_pkt); /// This function writes the first 32 bits of an aql packet to the provided /// aql packet. This function is meant to be called immediately before /// ringing door_bell signal. @@ -89,9 +87,9 @@ void WriteAQLToQueueLoc(hsa_queue_t *queue, uint64_t indx, /// be written /// \returns void inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup, - hsa_kernel_dispatch_packet_t* queue_packet) { - __atomic_store_n(reinterpret_cast(queue_packet), - header | (setup <<16), __ATOMIC_RELEASE); + hsa_kernel_dispatch_packet_t* queue_packet) { + __atomic_store_n(reinterpret_cast(queue_packet), header | (setup << 16), + __ATOMIC_RELEASE); } /// Perform common operations to clean up after executing a test. 
Specifically, @@ -121,8 +119,7 @@ bool CheckProfile(RdcRocrBase const* test); /// \param arg_size Size of the kernel arg data (including padding) to be /// written /// \returns HSA_STATUS_SUCCESS if no errors -hsa_status_t AllocAndSetKernArgs(RdcRocrBase* test, void* args, - size_t arg_size); +hsa_status_t AllocAndSetKernArgs(RdcRocrBase* test, void* args, size_t arg_size); /// Verify that the machine running the test has the required profile. /// This function will verify that the execution machine meets any specific @@ -149,8 +146,9 @@ hsa_status_t SetPoolsTypical(RdcRocrBase* test); /// \param[in] test Test that has handles to cpu and gpu agents that can own /// either source or destination of fill /// \returns HSA_STATUS_OK if not errors -hsa_status_t hsa_memory_fill_workaround_gen(void* ptr, uint32_t value, - size_t count, hsa_agent_t dst_ag, hsa_agent_t src_ag, RdcRocrBase* test); +hsa_status_t hsa_memory_fill_workaround_gen(void* ptr, uint32_t value, size_t count, + hsa_agent_t dst_ag, hsa_agent_t src_ag, + RdcRocrBase* test); /// Get the library directory which is loaded by current process. /// It will search /proc/self/maps for it. @@ -162,11 +160,9 @@ std::string get_app_dir(); // Search multiple folder for the hsaco file // Return empty if cannot find it. -std::string search_hsaco_full_path(const char* hsaco_file_name, - const char* agent_name); +std::string search_hsaco_full_path(const char* hsaco_file_name, const char* agent_name); } // namespace rdc } // namespace amd #endif // RDC_MODULES_RDC_ROCR_BASE_ROCR_UTILS_H_ - diff --git a/projects/rdc/include/rdc_modules/rdc_rocr/common.h b/projects/rdc/include/rdc_modules/rdc_rocr/common.h old mode 100755 new mode 100644 index 3b9ff89a6d..e0ea1f0349 --- a/projects/rdc/include/rdc_modules/rdc_rocr/common.h +++ b/projects/rdc/include/rdc_modules/rdc_rocr/common.h @@ -28,12 +28,13 @@ THE SOFTWARE. 
#include #include + #include #include #include -#include -#include #include +#include +#include #include "hsa/hsa.h" #include "hsa/hsa_ext_amd.h" @@ -45,13 +46,13 @@ namespace rdc { #define ALIGNED_(x) __declspec(align(x)) #else #if defined(__GNUC__) -#define ALIGNED_(x) __attribute__ ((aligned(x))) +#define ALIGNED_(x) __attribute__((aligned(x))) #endif // __GNUC__ #endif // _MSC_VER -#define MULTILINE(...) # __VA_ARGS__ +#define MULTILINE(...) #__VA_ARGS__ -#define ASSERT_EQ(a, b) (a==b) +#define ASSERT_EQ(a, b) (a == b) void SetEnv(const char* env_var_name, const char* env_var_value); intptr_t AlignDown(intptr_t value, size_t alignment); @@ -66,39 +67,35 @@ void* AlignUp(void* value, size_t alignment); // related calls, and is later used for reference when displaying the // information. typedef struct pool_info_t_ { - uint32_t segment; - size_t size; - bool alloc_allowed; - size_t alloc_granule; - size_t alloc_alignment; - bool accessible_by_all; - uint32_t global_flag; - uint64_t aggregate_alloc_max; - inline bool operator==(const pool_info_t_ &a) { - if (a.segment == segment && a.size == size - && a.alloc_allowed == alloc_allowed - && a.alloc_granule == alloc_granule - && a.alloc_alignment == alloc_alignment - && a.accessible_by_all == accessible_by_all - && a.aggregate_alloc_max == aggregate_alloc_max - && a.global_flag == global_flag ) - return true; - else - return false; - } + uint32_t segment; + size_t size; + bool alloc_allowed; + size_t alloc_granule; + size_t alloc_alignment; + bool accessible_by_all; + uint32_t global_flag; + uint64_t aggregate_alloc_max; + inline bool operator==(const pool_info_t_& a) { + if (a.segment == segment && a.size == size && a.alloc_allowed == alloc_allowed && + a.alloc_granule == alloc_granule && a.alloc_alignment == alloc_alignment && + a.accessible_by_all == accessible_by_all && a.aggregate_alloc_max == aggregate_alloc_max && + a.global_flag == global_flag) + return true; + else + return false; + } } pool_info_t; - 
-struct agent_pools_t{ - hsa_agent_t agent; - std::vector pools; +struct agent_pools_t { + hsa_agent_t agent; + std::vector pools; }; /// Fill in the pool_info_t structure for the provided pool. /// \param[in] pool Pool for which information will be retrieved /// \param[out] pool_i Pointer to structure where pool info will be stored /// \returns HSA_STATUS_SUCCESS if no errors are encountered. -hsa_status_t AcquirePoolInfo(hsa_amd_memory_pool_t pool, pool_info_t *pool_i); +hsa_status_t AcquirePoolInfo(hsa_amd_memory_pool_t pool, pool_info_t* pool_i); /// If the provided agent is associated with a GPU, return that agent through /// output parameter. This function is meant to be the call-back function used @@ -128,7 +125,7 @@ hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data); /// \param[out] data If agent is associated with a CPU, this pointer will point /// to the agent upon return /// \returns HSA_STATUS_SUCCESS if no errors are encountered. -hsa_status_t IterateCPUAgents(hsa_agent_t agent, void *data); +hsa_status_t IterateCPUAgents(hsa_agent_t agent, void* data); /// If the provided agent is associated with a GPU, return that agent through /// output parameter. This function is meant to be the call-back function used @@ -137,7 +134,7 @@ hsa_status_t IterateCPUAgents(hsa_agent_t agent, void *data); /// \param[out] data If agent is associated with a GPU, this pointer will point /// to the agent upon return /// \returns HSA_STATUS_SUCCESS if no errors are encountered. -hsa_status_t IterateGPUAgents(hsa_agent_t agent, void *data); +hsa_status_t IterateGPUAgents(hsa_agent_t agent, void* data); /// Find a GLOBAL memory pool. By this, we mean not a kernel args pool. 
/// This function is meant to be the call-back function used @@ -163,7 +160,6 @@ hsa_status_t GetGlobalMemoryPool(hsa_amd_memory_pool_t pool, void* data); /// -else return an appropriate error code for any error encountered hsa_status_t GetKernArgMemoryPool(hsa_amd_memory_pool_t pool, void* data); - /// Find a "standard" pool. By this, we mean not a kernel args pool. /// The pool found will have the following properties: /// HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL: Don't care @@ -201,16 +197,14 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data); /// \param[in] pool Pool to gather and dump information for /// \param[in] indent Number of spaces to indent output. /// \returns hsa_status_t HSA_STATUS_SUCCESS if no errors -hsa_status_t DumpMemoryPoolInfo(const pool_info_t *pool_i, - uint32_t indent = 0); +hsa_status_t DumpMemoryPoolInfo(const pool_info_t* pool_i, uint32_t indent = 0); /// Dump information about a provided pointer to STDOUT. /// \param[in] ptr Pointer about which information is dumped. 
/// \returns HSA_STATUS_SUCCESS if there are no errors hsa_status_t DumpPointerInfo(void* ptr); -hsa_status_t GetAgentPools( - std::vector> *agent_pools); +hsa_status_t GetAgentPools(std::vector>* agent_pools); void throw_if_error(hsa_status_t err, const std::string& msg = ""); @@ -219,10 +213,11 @@ void throw_if_skip(const std::string& msg); // The customize exception when the test has to be skipped class SkipException : public std::exception { public: - explicit SkipException(const char* msg): _msg(msg) {} - virtual const char* what() const noexcept { return _msg.c_str(); } + explicit SkipException(const char* msg) : _msg(msg) {} + virtual const char* what() const noexcept { return _msg.c_str(); } + private: - std::string _msg; + std::string _msg; }; } // namespace rdc diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc index fb66fb77b7..1758e8b5d1 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -21,404 +21,370 @@ THE SOFTWARE. 
*/ #include #include + #include + #include "common/rdc_fields_supported.h" #include "rdc/rdc.h" #include "rdc_lib/RdcHandler.h" +#include "rdc_lib/RdcLibraryLoader.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" -#include "rdc_lib/RdcLibraryLoader.h" - static amd::rdc::RdcLibraryLoader rdc_lib_loader; -rdc_status_t rdc_init(uint64_t) { - return RDC_ST_OK; -} +rdc_status_t rdc_init(uint64_t) { return RDC_ST_OK; } -rdc_status_t rdc_shutdown() { - return rdc_lib_loader.unload(); -} +rdc_status_t rdc_shutdown() { return rdc_lib_loader.unload(); } -rdc_status_t rdc_connect(const char* ipAddress, - rdc_handle_t* p_rdc_handle, - const char* root_ca, const char* client_cert, - const char* client_key ) { - amd::rdc::RdcHandler* (*func_make_handler)(const char*, - const char*, const char*, const char*) = nullptr; +rdc_status_t rdc_connect(const char* ipAddress, rdc_handle_t* p_rdc_handle, const char* root_ca, + const char* client_cert, const char* client_key) { + amd::rdc::RdcHandler* (*func_make_handler)(const char*, const char*, const char*, const char*) = + nullptr; - if (!ipAddress || !p_rdc_handle) { - return RDC_ST_FAIL_LOAD_MODULE; - } + if (!ipAddress || !p_rdc_handle) { + return RDC_ST_FAIL_LOAD_MODULE; + } - rdc_status_t status = rdc_lib_loader.load("librdc_client.so", - &func_make_handler); - if (status != RDC_ST_OK) { - *p_rdc_handle = nullptr; - return status; - } + rdc_status_t status = rdc_lib_loader.load("librdc_client.so", &func_make_handler); + if (status != RDC_ST_OK) { + *p_rdc_handle = nullptr; + return status; + } - *p_rdc_handle = static_cast - (func_make_handler(ipAddress, - root_ca, client_cert, client_key)); - return RDC_ST_OK; + *p_rdc_handle = + static_cast(func_make_handler(ipAddress, root_ca, client_cert, client_key)); + return RDC_ST_OK; } rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } - delete static_cast(p_rdc_handle); - p_rdc_handle = nullptr; - return 
RDC_ST_OK; + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + delete static_cast(p_rdc_handle); + p_rdc_handle = nullptr; + return RDC_ST_OK; } -rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, - rdc_handle_t* p_rdc_handle ) { - amd::rdc::RdcHandler* (*func_make_handler)(rdc_operation_mode_t) - = nullptr; - if (!p_rdc_handle) { - return RDC_ST_FAIL_LOAD_MODULE; - } +rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, rdc_handle_t* p_rdc_handle) { + amd::rdc::RdcHandler* (*func_make_handler)(rdc_operation_mode_t) = nullptr; + if (!p_rdc_handle) { + return RDC_ST_FAIL_LOAD_MODULE; + } - rdc_status_t status = rdc_lib_loader.load("librdc.so", - &func_make_handler); - if (status != RDC_ST_OK) { - *p_rdc_handle = nullptr; - return status; - } + rdc_status_t status = rdc_lib_loader.load("librdc.so", &func_make_handler); + if (status != RDC_ST_OK) { + *p_rdc_handle = nullptr; + return status; + } - *p_rdc_handle = static_cast - (func_make_handler(op_mode)); + *p_rdc_handle = static_cast(func_make_handler(op_mode)); - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } - delete static_cast(p_rdc_handle); - p_rdc_handle = nullptr; - return RDC_ST_OK; + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + delete static_cast(p_rdc_handle); + p_rdc_handle = nullptr; + return RDC_ST_OK; } -rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle, - uint32_t wait_for_update) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle, uint32_t wait_for_update) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_field_update_all(wait_for_update); + return static_cast(p_rdc_handle)->rdc_field_update_all(wait_for_update); } -rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, - const char job_id[64], 
rdc_job_info_t* p_job_info) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, const char job_id[64], + rdc_job_info_t* p_job_info) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_job_get_stats(job_id, p_job_info); + return static_cast(p_rdc_handle)->rdc_job_get_stats(job_id, p_job_info); } -rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t groupId, const char job_id[64], - uint64_t update_freq) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, rdc_gpu_group_t groupId, + const char job_id[64], uint64_t update_freq) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_job_start_stats(groupId, job_id, update_freq); + return static_cast(p_rdc_handle) + ->rdc_job_start_stats(groupId, job_id, update_freq); } rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, const char job_id[64]) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_job_remove(job_id); + return static_cast(p_rdc_handle)->rdc_job_remove(job_id); } rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_job_remove_all(); + return static_cast(p_rdc_handle)->rdc_job_remove_all(); } +rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, const char job_id[64]) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } -rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, - const char job_id[64] ) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } - - return static_cast(p_rdc_handle)-> - rdc_job_stop_stats(job_id); + return 
static_cast(p_rdc_handle)->rdc_job_stop_stats(job_id); } -rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, - rdc_group_type_t type, const char* group_name, - rdc_gpu_group_t* p_rdc_group_id) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, rdc_group_type_t type, + const char* group_name, rdc_gpu_group_t* p_rdc_group_id) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_group_gpu_create(type, group_name, p_rdc_group_id); + return static_cast(p_rdc_handle) + ->rdc_group_gpu_create(type, group_name, p_rdc_group_id); } -rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t groupId, uint32_t gpuIndex ) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, rdc_gpu_group_t groupId, + uint32_t gpuIndex) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_group_gpu_add(groupId, gpuIndex); + return static_cast(p_rdc_handle)->rdc_group_gpu_add(groupId, gpuIndex); } rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle, - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { - if (!p_rdc_handle || !count) { - return RDC_ST_INVALID_HANDLER; - } + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { + if (!p_rdc_handle || !count) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_device_get_all(gpu_index_list, count); + return static_cast(p_rdc_handle) + ->rdc_device_get_all(gpu_index_list, count); } -rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, - uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr) { - if (!p_rdc_handle || !p_rdc_attr) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_index, + rdc_device_attributes_t* p_rdc_attr) { + if 
(!p_rdc_handle || !p_rdc_attr) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_device_get_attributes(gpu_index, p_rdc_attr); + return static_cast(p_rdc_handle) + ->rdc_device_get_attributes(gpu_index, p_rdc_attr); } -rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle, - uint32_t num_field_ids, rdc_field_t* field_ids, - const char* field_group_name, rdc_field_grp_t* rdc_field_group_id) { - if (!p_rdc_handle || !field_ids || - !field_group_name || !rdc_field_group_id) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle, uint32_t num_field_ids, + rdc_field_t* field_ids, const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) { + if (!p_rdc_handle || !field_ids || !field_group_name || !rdc_field_group_id) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_group_field_create(num_field_ids, field_ids, - field_group_name, rdc_field_group_id); + return static_cast(p_rdc_handle) + ->rdc_group_field_create(num_field_ids, field_ids, field_group_name, rdc_field_group_id); } -rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, - rdc_field_grp_t rdc_field_group_id, - rdc_field_group_info_t* field_group_info) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id, + rdc_field_group_info_t* field_group_info) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_group_field_get_info(rdc_field_group_id, field_group_info); + return static_cast(p_rdc_handle) + ->rdc_group_field_get_info(rdc_field_group_id, field_group_info); } -rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) { - if (!p_rdc_handle || !p_rdc_group_info) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t 
rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) { + if (!p_rdc_handle || !p_rdc_group_info) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_group_gpu_get_info(p_rdc_group_id, p_rdc_group_info); + return static_cast(p_rdc_handle) + ->rdc_group_gpu_get_info(p_rdc_group_id, p_rdc_group_info); } -rdc_status_t rdc_group_get_all_ids(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id_list[], uint32_t* count) { - if (!p_rdc_handle || !count) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_group_get_all_ids(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id_list[], + uint32_t* count) { + if (!p_rdc_handle || !count) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_group_get_all_ids(group_id_list, count); + return static_cast(p_rdc_handle) + ->rdc_group_get_all_ids(group_id_list, count); } rdc_status_t rdc_group_field_get_all_ids(rdc_handle_t p_rdc_handle, - rdc_field_grp_t field_group_id_list[], uint32_t* count) { - if (!p_rdc_handle || !count) { - return RDC_ST_INVALID_HANDLER; - } + rdc_field_grp_t field_group_id_list[], uint32_t* count) { + if (!p_rdc_handle || !count) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_group_field_get_all_ids(field_group_id_list, count); + return static_cast(p_rdc_handle) + ->rdc_group_field_get_all_ids(field_group_id_list, count); } -rdc_status_t rdc_field_watch(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, - uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_field_watch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id, uint64_t update_freq, + double max_keep_age, uint32_t max_keep_samples) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return 
static_cast(p_rdc_handle)-> - rdc_field_watch(group_id, field_group_id, update_freq, - max_keep_age, max_keep_samples); + return static_cast(p_rdc_handle) + ->rdc_field_watch(group_id, field_group_id, update_freq, max_keep_age, max_keep_samples); } -rdc_status_t rdc_field_get_latest_value(rdc_handle_t p_rdc_handle, - uint32_t gpu_index, rdc_field_t field, rdc_field_value* value) { - if (!p_rdc_handle || !value) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_field_get_latest_value(rdc_handle_t p_rdc_handle, uint32_t gpu_index, + rdc_field_t field, rdc_field_value* value) { + if (!p_rdc_handle || !value) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_field_get_latest_value(gpu_index, field, value); + return static_cast(p_rdc_handle) + ->rdc_field_get_latest_value(gpu_index, field, value); } -rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle, - uint32_t gpu_index, rdc_field_t field, uint64_t since_time_stamp, - uint64_t *next_since_time_stamp, rdc_field_value* value) { - if (!p_rdc_handle || !next_since_time_stamp || !value) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle, uint32_t gpu_index, + rdc_field_t field, uint64_t since_time_stamp, + uint64_t* next_since_time_stamp, rdc_field_value* value) { + if (!p_rdc_handle || !next_since_time_stamp || !value) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_field_get_value_since(gpu_index, field, since_time_stamp, - next_since_time_stamp, value); + return static_cast(p_rdc_handle) + ->rdc_field_get_value_since(gpu_index, field, since_time_stamp, next_since_time_stamp, value); } -rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id) { 
+ if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_field_unwatch(group_id, field_group_id); + return static_cast(p_rdc_handle) + ->rdc_field_unwatch(group_id, field_group_id); } -rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t p_rdc_group_id) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_group_gpu_destroy(p_rdc_group_id); + return static_cast(p_rdc_handle)->rdc_group_gpu_destroy(p_rdc_group_id); } rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, - rdc_field_grp_t rdc_field_group_id) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } + rdc_field_grp_t rdc_field_group_id) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_group_field_destroy(rdc_field_group_id); + return static_cast(p_rdc_handle) + ->rdc_group_field_destroy(rdc_field_group_id); } -rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, - rdc_diag_level_t level, - rdc_diag_response_t* response) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_diag_level_t level, rdc_diag_response_t* response) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_diagnostic_run(group_id, level, response); + return static_cast(p_rdc_handle) + ->rdc_diagnostic_run(group_id, level, response); } -rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, - rdc_diag_test_cases_t test_case, - rdc_diag_test_result_t* result) { - if (!p_rdc_handle) { - return RDC_ST_INVALID_HANDLER; - } +rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, 
rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, rdc_diag_test_result_t* result) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } - return static_cast(p_rdc_handle)-> - rdc_test_case_run(group_id, test_case, result); + return static_cast(p_rdc_handle) + ->rdc_test_case_run(group_id, test_case, result); } const char* rdc_status_string(rdc_status_t result) { - switch (result) { - case RDC_ST_OK: - return "Success"; - case RDC_ST_NOT_SUPPORTED: - return "Not supported"; - case RDC_ST_FAIL_LOAD_MODULE: - return "Fail to load module"; - case RDC_ST_INVALID_HANDLER: - return "Invalid handler"; - case RDC_ST_NOT_FOUND: - return "Cannot find the value"; - case RDC_ST_BAD_PARAMETER: - return "Invalid parameters"; - case RDC_ST_MSI_ERROR: - return "SMI error"; - case RDC_ST_MAX_LIMIT: - return "The max limit reached"; - case RDC_ST_CONFLICT: - return "Conflict with current state"; - case RDC_ST_ALREADY_EXIST: - return "The value already exists"; - case RDC_ST_CLIENT_ERROR: - return "RDC Client error"; - case RDC_ST_INSUFF_RESOURCES: - return "Not enough resources to complete operation"; - case RDC_ST_FILE_ERROR: - return "Failed to access a file"; - case RDC_ST_NO_DATA: - return "Data was requested, but none was found"; - case RDC_ST_PERM_ERROR: - return "Insufficient permission to complete operation"; - case RDC_ST_UNKNOWN_ERROR: - return "Unknown error"; - default: - return "Unknown"; - } + switch (result) { + case RDC_ST_OK: + return "Success"; + case RDC_ST_NOT_SUPPORTED: + return "Not supported"; + case RDC_ST_FAIL_LOAD_MODULE: + return "Fail to load module"; + case RDC_ST_INVALID_HANDLER: + return "Invalid handler"; + case RDC_ST_NOT_FOUND: + return "Cannot find the value"; + case RDC_ST_BAD_PARAMETER: + return "Invalid parameters"; + case RDC_ST_MSI_ERROR: + return "SMI error"; + case RDC_ST_MAX_LIMIT: + return "The max limit reached"; + case RDC_ST_CONFLICT: + return "Conflict with current state"; + case RDC_ST_ALREADY_EXIST: + return "The 
value already exists"; + case RDC_ST_CLIENT_ERROR: + return "RDC Client error"; + case RDC_ST_INSUFF_RESOURCES: + return "Not enough resources to complete operation"; + case RDC_ST_FILE_ERROR: + return "Failed to access a file"; + case RDC_ST_NO_DATA: + return "Data was requested, but none was found"; + case RDC_ST_PERM_ERROR: + return "Insufficient permission to complete operation"; + case RDC_ST_UNKNOWN_ERROR: + return "Unknown error"; + default: + return "Unknown"; + } } const char* rdc_diagnostic_result_string(rdc_diag_result_t result) { - switch (result) { - case RDC_DIAG_RESULT_PASS: - return "Pass"; - case RDC_DIAG_RESULT_SKIP: - return "Skip"; - case RDC_DIAG_RESULT_WARN: - return "Warn"; - case RDC_DIAG_RESULT_FAIL: - return "Fail"; - default: - return "Unknown"; - } + switch (result) { + case RDC_DIAG_RESULT_PASS: + return "Pass"; + case RDC_DIAG_RESULT_SKIP: + return "Skip"; + case RDC_DIAG_RESULT_WARN: + return "Warn"; + case RDC_DIAG_RESULT_FAIL: + return "Fail"; + default: + return "Unknown"; + } } const char* field_id_string(rdc_field_t field_id) { - amd::rdc::fld_id2name_map_t &field_id_to_descript = - amd::rdc::get_field_id_description_from_id(); + amd::rdc::fld_id2name_map_t& field_id_to_descript = amd::rdc::get_field_id_description_from_id(); return field_id_to_descript.find(field_id)->second.label.c_str(); } rdc_field_t get_field_id_from_name(const char* name) { - rdc_field_t value; - if (amd::rdc::get_field_id_from_name(name, &value)) { - return value; - } - return RDC_FI_INVALID; + rdc_field_t value; + if (amd::rdc::get_field_id_from_name(name, &value)) { + return value; + } + return RDC_FI_INVALID; } -char *strncpy_with_null(char *dest, const char *src, size_t n) { - if (n == 0) { - return dest; - } - strncpy(dest, src, n - 1); - dest[n - 1]= '\0'; - return dest; +char* strncpy_with_null(char* dest, const char* src, size_t n) { + if (n == 0) { + return dest; + } + strncpy(dest, src, n - 1); + dest[n - 1] = '\0'; + return dest; } diff --git 
a/projects/rdc/rdc_libs/bootstrap/src/RdcLibraryLoader.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcLibraryLoader.cc index eb56133760..6b24c4cd7f 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcLibraryLoader.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcLibraryLoader.cc @@ -25,40 +25,37 @@ THE SOFTWARE. namespace amd { namespace rdc { -RdcLibraryLoader::RdcLibraryLoader(): libHandler_(nullptr) { -} +RdcLibraryLoader::RdcLibraryLoader() : libHandler_(nullptr) {} rdc_status_t RdcLibraryLoader::load(const char* filename) { - if (filename == nullptr) { - return RDC_ST_FAIL_LOAD_MODULE; - } - if (libHandler_) { - unload(); - } + if (filename == nullptr) { + return RDC_ST_FAIL_LOAD_MODULE; + } + if (libHandler_) { + unload(); + } - std::lock_guard guard(library_mutex_); - libHandler_ = dlopen(filename, RTLD_LAZY); - if (!libHandler_) { - char* error = dlerror(); - RDC_LOG(RDC_ERROR, "Fail to open " << filename <<": " << error); - return RDC_ST_FAIL_LOAD_MODULE; - } + std::lock_guard guard(library_mutex_); + libHandler_ = dlopen(filename, RTLD_LAZY); + if (!libHandler_) { + char* error = dlerror(); + RDC_LOG(RDC_ERROR, "Fail to open " << filename << ": " << error); + return RDC_ST_FAIL_LOAD_MODULE; + } - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcLibraryLoader::unload() { - std::lock_guard guard(library_mutex_); - if (libHandler_) { - dlclose(libHandler_); - libHandler_ = nullptr; - } - return RDC_ST_OK; + std::lock_guard guard(library_mutex_); + if (libHandler_) { + dlclose(libHandler_); + libHandler_ = nullptr; + } + return RDC_ST_OK; } -RdcLibraryLoader::~RdcLibraryLoader() { - unload(); -} +RdcLibraryLoader::~RdcLibraryLoader() { unload(); } } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc index f5c28b88d2..3b993fb44e 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc @@ -20,59 +20,59 
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "rdc_lib/RdcLogger.h" + #include #include -#include + +#include // NOLINT #include #include -#include // NOLINT +#include + #include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdcLogger::RdcLogger(std::ostream& os): - os_(os) { - char* verbose = getenv("RDC_LOG"); - if (verbose == nullptr) { - log_level_ = RDC_ERROR; - } else if (strcmp(verbose, "DEBUG") == 0) { - log_level_ = RDC_DEBUG; - } else if (strcmp(verbose, "INFO") == 0) { - log_level_ = RDC_INFO; - } else { - log_level_ = RDC_ERROR; - } +RdcLogger::RdcLogger(std::ostream& os) : os_(os) { + char* verbose = getenv("RDC_LOG"); + if (verbose == nullptr) { + log_level_ = RDC_ERROR; + } else if (strcmp(verbose, "DEBUG") == 0) { + log_level_ = RDC_DEBUG; + } else if (strcmp(verbose, "INFO") == 0) { + log_level_ = RDC_INFO; + } else { + log_level_ = RDC_ERROR; + } } -std::string RdcLogger::get_log_header(uint32_t severity, - const char* file, int line) { - std::stringstream strstream; - auto ms = std::chrono::duration_cast - (std::chrono::system_clock::now().time_since_epoch()).count(); - strstream << std::fixed << std::setprecision(3) << (ms/1000.0) << " "; - if (severity == RDC_DEBUG) { - strstream << "DEBUG "; - } else if (severity == RDC_INFO) { - strstream << "INFO "; - } else { - strstream << "ERROR "; - } +std::string RdcLogger::get_log_header(uint32_t severity, const char* file, int line) { + std::stringstream strstream; + auto ms = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + strstream << std::fixed << std::setprecision(3) << (ms / 1000.0) << " "; + if (severity == RDC_DEBUG) { + strstream << "DEBUG "; + } else if (severity == RDC_INFO) { + strstream << "INFO "; + } else { + strstream << "ERROR "; + } - // extract out the file path as it may be very long. 
- if (file != nullptr) { - std::string file_str(file); - auto found = file_str.find_last_of("/"); - if (found != std::string::npos) { - file_str = file_str.substr(found+1); - } - strstream << file_str << "(" << line << "): "; + // extract out the file path as it may be very long. + if (file != nullptr) { + std::string file_str(file); + auto found = file_str.find_last_of("/"); + if (found != std::string::npos) { + file_str = file_str.substr(found + 1); } + strstream << file_str << "(" << line << "): "; + } - return strstream.str(); + return strstream.str(); } - } // namespace rdc } // namespace amd - diff --git a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index 7b3ea60b10..acc8fd0a88 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -20,464 +20,433 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "rdc_lib/impl/RdcCacheManagerImpl.h" + #include + #include #include #include + #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" - namespace amd { namespace rdc { -rdc_status_t RdcCacheManagerImpl::rdc_field_get_value_since( - uint32_t gpu_index, rdc_field_t field_id, uint64_t since_time_stamp, - uint64_t *next_since_time_stamp, rdc_field_value* value) { - if (!next_since_time_stamp || !value) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcCacheManagerImpl::rdc_field_get_value_since(uint32_t gpu_index, + rdc_field_t field_id, + uint64_t since_time_stamp, + uint64_t* next_since_time_stamp, + rdc_field_value* value) { + if (!next_since_time_stamp || !value) { + return RDC_ST_BAD_PARAMETER; + } - std::lock_guard guard(cache_mutex_); - RdcFieldKey field{gpu_index, field_id}; - auto cache_samples_ite = cache_samples_.find(field); - if (cache_samples_ite == cache_samples_.end() || - cache_samples_ite->second.size() == 0) { - return RDC_ST_NOT_FOUND; - } - - // 
TODO(bill_liu): Optimize it using the binary search - auto cache_values = cache_samples_ite->second; - for (auto cache_value=cache_values.begin(); - cache_value != cache_values.end(); cache_value++) { - if ( cache_value->last_time >= since_time_stamp ) { - // move to next potential timestamp - auto next_iter = std::next(cache_value); - if (next_iter != cache_values.end()) { - *next_since_time_stamp = next_iter->last_time; - } else { // Last item, set it to the future by adding 1us - *next_since_time_stamp = cache_value->last_time + 1; - } - value->ts = cache_value->last_time; - value->type = cache_value->type; - - if (value->type == STRING) { - strncpy_with_null(value->value.str, cache_value->value.str, - RDC_MAX_STR_LENGTH); - } else { - value->value.l_int = cache_value->value.l_int; - } - value->field_id = field_id; - return RDC_ST_OK; - } - } - - *next_since_time_stamp = since_time_stamp; + std::lock_guard guard(cache_mutex_); + RdcFieldKey field{gpu_index, field_id}; + auto cache_samples_ite = cache_samples_.find(field); + if (cache_samples_ite == cache_samples_.end() || cache_samples_ite->second.size() == 0) { return RDC_ST_NOT_FOUND; + } + + // TODO(bill_liu): Optimize it using the binary search + auto cache_values = cache_samples_ite->second; + for (auto cache_value = cache_values.begin(); cache_value != cache_values.end(); cache_value++) { + if (cache_value->last_time >= since_time_stamp) { + // move to next potential timestamp + auto next_iter = std::next(cache_value); + if (next_iter != cache_values.end()) { + *next_since_time_stamp = next_iter->last_time; + } else { // Last item, set it to the future by adding 1us + *next_since_time_stamp = cache_value->last_time + 1; + } + value->ts = cache_value->last_time; + value->type = cache_value->type; + + if (value->type == STRING) { + strncpy_with_null(value->value.str, cache_value->value.str, RDC_MAX_STR_LENGTH); + } else { + value->value.l_int = cache_value->value.l_int; + } + value->field_id = field_id; + 
return RDC_ST_OK; + } + } + + *next_since_time_stamp = since_time_stamp; + return RDC_ST_NOT_FOUND; } +rdc_status_t RdcCacheManagerImpl::evict_cache(uint32_t gpu_index, rdc_field_t field_id, + uint64_t max_keep_samples, double max_keep_age) { + std::lock_guard guard(cache_mutex_); -rdc_status_t RdcCacheManagerImpl::evict_cache(uint32_t gpu_index, - rdc_field_t field_id, uint64_t max_keep_samples, double max_keep_age) { - std::lock_guard guard(cache_mutex_); + RdcFieldKey field{gpu_index, field_id}; + auto cache_samples_ite = cache_samples_.find(field); + if (cache_samples_ite == cache_samples_.end() || cache_samples_ite->second.size() == 0) { + return RDC_ST_NOT_FOUND; + } - RdcFieldKey field{gpu_index, field_id}; - auto cache_samples_ite = cache_samples_.find(field); - if (cache_samples_ite == cache_samples_.end() || - cache_samples_ite->second.size() == 0) { - return RDC_ST_NOT_FOUND; + // Check max_keep_samples + auto& cache_values = cache_samples_ite->second; + int item_remove = cache_values.size() - max_keep_samples; + if (item_remove > 0) { + cache_values.erase(cache_values.begin(), cache_values.begin() + item_remove); + } + + // Check max_keep_age + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t now = static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; + + auto ite = cache_values.begin(); + while (ite != cache_values.end()) { + if (ite->last_time + max_keep_age * 1000 >= now) { + break; + } else { + ite = cache_values.erase(ite); } + } - // Check max_keep_samples - auto& cache_values = cache_samples_ite->second; - int item_remove = cache_values.size() - max_keep_samples; - if (item_remove > 0) { - cache_values.erase(cache_values.begin(), - cache_values.begin()+item_remove); - } - - // Check max_keep_age - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t now = static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; - - auto ite = cache_values.begin(); - while (ite != cache_values.end()) { - if (ite->last_time + max_keep_age*1000 >= now) { - break; 
- } else { - ite = cache_values.erase(ite); - } - } - - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcCacheManagerImpl::rdc_field_get_latest_value( - uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) { - if (!value) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcCacheManagerImpl::rdc_field_get_latest_value(uint32_t gpu_index, + rdc_field_t field_id, + rdc_field_value* value) { + if (!value) { + return RDC_ST_BAD_PARAMETER; + } - std::lock_guard guard(cache_mutex_); - RdcFieldKey field{gpu_index, field_id}; - auto cache_samples_ite = cache_samples_.find(field); - if (cache_samples_ite == cache_samples_.end() || - cache_samples_ite->second.size() == 0) { - return RDC_ST_NOT_FOUND; - } + std::lock_guard guard(cache_mutex_); + RdcFieldKey field{gpu_index, field_id}; + auto cache_samples_ite = cache_samples_.find(field); + if (cache_samples_ite == cache_samples_.end() || cache_samples_ite->second.size() == 0) { + return RDC_ST_NOT_FOUND; + } - auto& cache_value = cache_samples_ite->second.back(); - value->ts = cache_value.last_time; - value->type = cache_value.type; - value->value = cache_value.value; - value->field_id = field_id; + auto& cache_value = cache_samples_ite->second.back(); + value->ts = cache_value.last_time; + value->type = cache_value.type; + value->value = cache_value.value; + value->field_id = field_id; - return RDC_ST_OK; + return RDC_ST_OK; } std::string RdcCacheManagerImpl::get_cache_stats() { - std::stringstream strstream; - std::lock_guard guard(cache_mutex_); + std::stringstream strstream; + std::lock_guard guard(cache_mutex_); - strstream << "Cache samples:"; - auto cache_samples_ite = cache_samples_.begin(); - for (; cache_samples_ite != cache_samples_.end(); cache_samples_ite++) { - strstream << "<" << cache_samples_ite->first.first << "," - << cache_samples_ite->first.second << ":" - << cache_samples_ite->second.size() << "> "; - } + strstream << "Cache samples:"; + auto cache_samples_ite = 
cache_samples_.begin(); + for (; cache_samples_ite != cache_samples_.end(); cache_samples_ite++) { + strstream << "<" << cache_samples_ite->first.first << "," << cache_samples_ite->first.second + << ":" << cache_samples_ite->second.size() << "> "; + } - strstream <<" Job caches:"; - auto job_ite = cache_jobs_.begin(); - for ( ; job_ite != cache_jobs_.end(); job_ite++ ) { - strstream << "<" << job_ite->first << ":" - << job_ite->second.gpu_stats.size() << "> "; - } + strstream << " Job caches:"; + auto job_ite = cache_jobs_.begin(); + for (; job_ite != cache_jobs_.end(); job_ite++) { + strstream << "<" << job_ite->first << ":" << job_ite->second.gpu_stats.size() << "> "; + } - return strstream.str(); + return strstream.str(); } rdc_status_t RdcCacheManagerImpl::rdc_update_cache(uint32_t gpu_index, - const rdc_field_value& value) { - RdcCacheEntry entry; - entry.last_time = value.ts; - entry.value = value.value; - entry.type = value.type; + const rdc_field_value& value) { + RdcCacheEntry entry; + entry.last_time = value.ts; + entry.value = value.value; + entry.type = value.type; - std::lock_guard guard(cache_mutex_); - RdcFieldKey field{gpu_index, value.field_id}; - auto cache_samples_ite = cache_samples_.find(field); - if (cache_samples_ite == cache_samples_.end()) { - std::vector ve; - ve.push_back(entry); - cache_samples_.insert({field, ve}); - } else { - cache_samples_ite->second.push_back(entry); - } + std::lock_guard guard(cache_mutex_); + RdcFieldKey field{gpu_index, value.field_id}; + auto cache_samples_ite = cache_samples_.find(field); + if (cache_samples_ite == cache_samples_.end()) { + std::vector ve; + ve.push_back(entry); + cache_samples_.insert({field, ve}); + } else { + cache_samples_ite->second.push_back(entry); + } - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcCacheManagerImpl::rdc_job_remove(const char job_id[64]) { - std::lock_guard guard(cache_mutex_); - cache_jobs_.erase(job_id); - return RDC_ST_OK; + std::lock_guard 
guard(cache_mutex_); + cache_jobs_.erase(job_id); + return RDC_ST_OK; } rdc_status_t RdcCacheManagerImpl::rdc_job_remove_all() { - std::lock_guard guard(cache_mutex_); - cache_jobs_.clear(); - return RDC_ST_OK; + std::lock_guard guard(cache_mutex_); + cache_jobs_.clear(); + return RDC_ST_OK; } rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index, - const std::string& job_id, const rdc_field_value& value) { - std::lock_guard guard(cache_mutex_); - auto job_iter = cache_jobs_.find(job_id); - if (job_iter == cache_jobs_.end()) { - return RDC_ST_NOT_FOUND; - } + const std::string& job_id, + const rdc_field_value& value) { + std::lock_guard guard(cache_mutex_); + auto job_iter = cache_jobs_.find(job_id); + if (job_iter == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } - auto gpu_iter = job_iter->second.gpu_stats.find(gpu_index); - if (gpu_iter == job_iter->second.gpu_stats.end()) { - return RDC_ST_NOT_FOUND; - } + auto gpu_iter = job_iter->second.gpu_stats.find(gpu_index); + if (gpu_iter == job_iter->second.gpu_stats.end()) { + return RDC_ST_NOT_FOUND; + } - auto fsummary = gpu_iter->second.field_summaries.find(value.field_id); - if (fsummary == gpu_iter->second.field_summaries.end()) { - return RDC_ST_NOT_FOUND; - } - if (fsummary->second.count == 0) { // first item - fsummary->second.count = 1; - fsummary->second.max_value = value.value.l_int; - fsummary->second.min_value = value.value.l_int; - fsummary->second.total_value = value.value.l_int; - fsummary->second.last_time = value.ts; - if (value.field_id == RDC_FI_POWER_USAGE) { - gpu_iter->second.energy_last_time = value.ts; - } - - // https://www.johndcook.com/blog/standard_deviation/ - fsummary->second.old_s = 0; - fsummary->second.old_m = fsummary->second.new_m = value.value.l_int; - return RDC_ST_OK; - } - if (value.field_id == RDC_FI_POWER_USAGE) { - uint64_t time_elapsed = value.ts - gpu_iter->second.energy_last_time; - // Stored in cache as microseconds and microwats - 
gpu_iter->second.energy_consumed += - (time_elapsed * value.value.l_int)/(1000.0*1000000); - } - fsummary->second.max_value = std::max(fsummary->second.max_value, - static_cast(value.value.l_int)); - fsummary->second.min_value = std::min(fsummary->second.min_value, - static_cast(value.value.l_int)); - fsummary->second.total_value += value.value.l_int; + auto fsummary = gpu_iter->second.field_summaries.find(value.field_id); + if (fsummary == gpu_iter->second.field_summaries.end()) { + return RDC_ST_NOT_FOUND; + } + if (fsummary->second.count == 0) { // first item + fsummary->second.count = 1; + fsummary->second.max_value = value.value.l_int; + fsummary->second.min_value = value.value.l_int; + fsummary->second.total_value = value.value.l_int; fsummary->second.last_time = value.ts; - fsummary->second.count++; + if (value.field_id == RDC_FI_POWER_USAGE) { + gpu_iter->second.energy_last_time = value.ts; + } // https://www.johndcook.com/blog/standard_deviation/ - fsummary->second.new_m = fsummary->second.old_m + - (value.value.l_int - fsummary->second.old_m)/fsummary->second.count; - fsummary->second.new_s = fsummary->second.old_s + - (value.value.l_int - fsummary->second.old_m)* - (value.value.l_int - fsummary->second.new_m); - fsummary->second.old_m = fsummary->second.new_m; - fsummary->second.old_s = fsummary->second.new_s; - + fsummary->second.old_s = 0; + fsummary->second.old_m = fsummary->second.new_m = value.value.l_int; return RDC_ST_OK; + } + if (value.field_id == RDC_FI_POWER_USAGE) { + uint64_t time_elapsed = value.ts - gpu_iter->second.energy_last_time; + // Stored in cache as microseconds and microwats + gpu_iter->second.energy_consumed += (time_elapsed * value.value.l_int) / (1000.0 * 1000000); + } + fsummary->second.max_value = + std::max(fsummary->second.max_value, static_cast(value.value.l_int)); + fsummary->second.min_value = + std::min(fsummary->second.min_value, static_cast(value.value.l_int)); + fsummary->second.total_value += value.value.l_int; + 
fsummary->second.last_time = value.ts; + fsummary->second.count++; + + // https://www.johndcook.com/blog/standard_deviation/ + fsummary->second.new_m = fsummary->second.old_m + + (value.value.l_int - fsummary->second.old_m) / fsummary->second.count; + fsummary->second.new_s = + fsummary->second.old_s + + (value.value.l_int - fsummary->second.old_m) * (value.value.l_int - fsummary->second.new_m); + fsummary->second.old_m = fsummary->second.new_m; + fsummary->second.old_s = fsummary->second.new_s; + + return RDC_ST_OK; } -void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats, - rdc_stats_summary_t & gpu, rdc_stats_summary_t& summary, - unsigned int adjuster) { - if (stats.count == 0) { - gpu.min_value = std::numeric_limits::max(); - gpu.max_value = gpu.average = 0; - return; - } +void RdcCacheManagerImpl::set_summary(const FieldSummaryStats& stats, rdc_stats_summary_t& gpu, + rdc_stats_summary_t& summary, unsigned int adjuster) { + if (stats.count == 0) { + gpu.min_value = std::numeric_limits::max(); + gpu.max_value = gpu.average = 0; + return; + } - gpu.max_value = stats.max_value / adjuster; - gpu.min_value = stats.min_value / adjuster; - gpu.average = stats.total_value / stats.count / adjuster; - summary.max_value = std::max(summary.max_value, gpu.max_value); - summary.min_value = std::min(summary.min_value, gpu.min_value); - //< save total for future average calculation. - summary.average += gpu.average; + gpu.max_value = stats.max_value / adjuster; + gpu.min_value = stats.min_value / adjuster; + gpu.average = stats.total_value / stats.count / adjuster; + summary.max_value = std::max(summary.max_value, gpu.max_value); + summary.min_value = std::min(summary.min_value, gpu.min_value); + //< save total for future average calculation. + summary.average += gpu.average; - //< calculate the sample variance - gpu.standard_deviation = std::sqrt((stats.count > 1) - ? 
stats.new_s/(stats.count - 1) : 0.0)/adjuster; - summary.standard_deviation += gpu.standard_deviation; + //< calculate the sample variance + gpu.standard_deviation = + std::sqrt((stats.count > 1) ? stats.new_s / (stats.count - 1) : 0.0) / adjuster; + summary.standard_deviation += gpu.standard_deviation; } rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64], - const rdc_gpu_gauges_t& gpu_gauges, - rdc_job_info_t* p_job_info) { - std::lock_guard guard(cache_mutex_); - auto job_stats = cache_jobs_.find(jobId); + const rdc_gpu_gauges_t& gpu_gauges, + rdc_job_info_t* p_job_info) { + std::lock_guard guard(cache_mutex_); + auto job_stats = cache_jobs_.find(jobId); - if (job_stats == cache_jobs_.end()) { - return RDC_ST_NOT_FOUND; - } + if (job_stats == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } - //< Init the summary info - bool is_job_stopped = (job_stats->second.end_time != 0); - RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " << jobId); - auto& summary_info = p_job_info->summary; - summary_info.start_time = job_stats->second.start_time; - if (job_stats->second.end_time == 0) { - summary_info.end_time = time(nullptr); + //< Init the summary info + bool is_job_stopped = (job_stats->second.end_time != 0); + RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " << jobId); + auto& summary_info = p_job_info->summary; + summary_info.start_time = job_stats->second.start_time; + if (job_stats->second.end_time == 0) { + summary_info.end_time = time(nullptr); + } else { + summary_info.end_time = job_stats->second.end_time; + } + summary_info.energy_consumed = 0; + summary_info.max_gpu_memory_used = 0; + summary_info.ecc_correct = 0; + summary_info.ecc_uncorrect = 0; + summary_info.power_usage = {0, std::numeric_limits::max(), 0, 0}; + summary_info.pcie_tx = {0, std::numeric_limits::max(), 0, 0}; + summary_info.pcie_rx = {0, std::numeric_limits::max(), 0, 0}; + summary_info.gpu_temperature = {0, std::numeric_limits::max(), 0, 0}; + summary_info.memory_clock 
= {0, std::numeric_limits::max(), 0, 0}; + summary_info.gpu_clock = {0, std::numeric_limits::max(), 0, 0}; + summary_info.gpu_utilization = {0, std::numeric_limits::max(), 0, 0}; + summary_info.memory_utilization = {0, std::numeric_limits::max(), 0, 0}; + + p_job_info->num_gpus = job_stats->second.gpu_stats.size(); + + //< Populate information for each GPUs + + auto gpus = job_stats->second.gpu_stats.begin(); + for (; gpus != job_stats->second.gpu_stats.end(); gpus++) { + auto& gpu_info = p_job_info->gpus[gpus->first]; + gpu_info.start_time = summary_info.start_time; + gpu_info.end_time = summary_info.end_time; + gpu_info.energy_consumed = gpus->second.energy_consumed; + summary_info.energy_consumed += gpu_info.energy_consumed; + + if (is_job_stopped) { + gpu_info.ecc_correct = gpus->second.ecc_correct_init; + summary_info.ecc_correct += gpu_info.ecc_correct; + } else if (gpu_gauges.find({gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) { + gpu_info.ecc_correct = + gpu_gauges.at({gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) - gpus->second.ecc_correct_init; + summary_info.ecc_correct += gpu_info.ecc_correct; } else { - summary_info.end_time = job_stats->second.end_time; + gpu_info.ecc_correct = 0; } - summary_info.energy_consumed = 0; - summary_info.max_gpu_memory_used = 0; - summary_info.ecc_correct = 0; - summary_info.ecc_uncorrect = 0; - summary_info.power_usage = {0, std::numeric_limits::max(), 0, 0}; - summary_info.pcie_tx = {0, std::numeric_limits::max(), 0, 0}; - summary_info.pcie_rx = {0, std::numeric_limits::max(), 0, 0}; - summary_info.gpu_temperature = - {0, std::numeric_limits::max(), 0, 0}; - summary_info.memory_clock = {0, std::numeric_limits::max(), 0, 0}; - summary_info.gpu_clock = {0, std::numeric_limits::max(), 0, 0}; - summary_info.gpu_utilization = - {0, std::numeric_limits::max(), 0, 0}; - summary_info.memory_utilization = {0, - std::numeric_limits::max(), 0, 0}; - p_job_info->num_gpus = job_stats->second.gpu_stats.size(); - - //< 
Populate information for each GPUs - - auto gpus = job_stats->second.gpu_stats.begin(); - for (; gpus != job_stats->second.gpu_stats.end(); gpus++) { - auto & gpu_info = p_job_info->gpus[gpus->first]; - gpu_info.start_time = summary_info.start_time; - gpu_info.end_time = summary_info.end_time; - gpu_info.energy_consumed = gpus->second.energy_consumed; - summary_info.energy_consumed += gpu_info.energy_consumed; - - if (is_job_stopped) { - gpu_info.ecc_correct = gpus->second.ecc_correct_init; - summary_info.ecc_correct += gpu_info.ecc_correct; - } else if (gpu_gauges.find({gpus->first, - RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) { - gpu_info.ecc_correct = gpu_gauges.at({ - gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) - - gpus->second.ecc_correct_init; - summary_info.ecc_correct += gpu_info.ecc_correct; - } else { - gpu_info.ecc_correct = 0; - } - - if (is_job_stopped) { - gpu_info.ecc_uncorrect = gpus->second.ecc_uncorrect_init; - summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect; - } else if (gpu_gauges.find({gpus->first, - RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) { - gpu_info.ecc_uncorrect = gpu_gauges.at({ - gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) - - gpus->second.ecc_uncorrect_init; - summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect; - } else { - gpu_info.ecc_uncorrect = 0; - } - - if (gpu_gauges.find({gpus->first, - RDC_FI_GPU_MEMORY_TOTAL}) == gpu_gauges.end()) { - RDC_LOG(RDC_ERROR, "Cannot find the total memory"); - return RDC_ST_BAD_PARAMETER; - } - uint64_t tmemory = gpu_gauges.at({gpus->first, - RDC_FI_GPU_MEMORY_TOTAL}); - - auto ite = gpus->second.field_summaries.begin(); - for (; ite != gpus->second.field_summaries.end(); ite++) { - if (ite->first == RDC_FI_POWER_USAGE) { - set_summary(ite->second, - gpu_info.power_usage, summary_info.power_usage, 1000000); - } else if (ite->first == RDC_FI_GPU_MEMORY_USAGE) { - set_summary(ite->second, gpu_info.memory_utilization, - summary_info.memory_utilization, tmemory/100); - 
gpu_info.max_gpu_memory_used = ite->second.max_value; - summary_info.max_gpu_memory_used = std::max( - summary_info.max_gpu_memory_used, - gpu_info.max_gpu_memory_used); - } else if (ite->first == RDC_FI_GPU_CLOCK) { - set_summary(ite->second, gpu_info.gpu_clock, - summary_info.gpu_clock, 1000000); - } else if (ite->first == RDC_FI_GPU_UTIL) { - set_summary(ite->second, gpu_info.gpu_utilization, - summary_info.gpu_utilization, 1); - } else if (ite->first == RDC_FI_GPU_TEMP) { - set_summary(ite->second, - gpu_info.gpu_temperature, summary_info.gpu_temperature, 1000); - } else if (ite->first == RDC_FI_MEM_CLOCK) { - set_summary(ite->second, - gpu_info.memory_clock, summary_info.memory_clock, 1000000); - } else if (ite->first == RDC_FI_PCIE_TX) { - set_summary(ite->second, - gpu_info.pcie_tx, summary_info.pcie_tx, 1024*1024); - } else if (ite->first == RDC_FI_PCIE_RX) { - set_summary(ite->second, - gpu_info.pcie_rx, summary_info.pcie_rx, 1024*1024); - } - } + if (is_job_stopped) { + gpu_info.ecc_uncorrect = gpus->second.ecc_uncorrect_init; + summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect; + } else if (gpu_gauges.find({gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) { + gpu_info.ecc_uncorrect = gpu_gauges.at({gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) - + gpus->second.ecc_uncorrect_init; + summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect; + } else { + gpu_info.ecc_uncorrect = 0; } - // Set the average of the summary - set_average_summary(summary_info.power_usage, p_job_info->num_gpus); - set_average_summary(summary_info.gpu_clock, p_job_info->num_gpus); - set_average_summary(summary_info.gpu_utilization, p_job_info->num_gpus); - set_average_summary(summary_info.memory_utilization, p_job_info->num_gpus); - set_average_summary(summary_info.pcie_tx, p_job_info->num_gpus); - set_average_summary(summary_info.pcie_rx, p_job_info->num_gpus); - set_average_summary(summary_info.gpu_temperature, p_job_info->num_gpus); - 
set_average_summary(summary_info.memory_clock, p_job_info->num_gpus); - return RDC_ST_OK; + if (gpu_gauges.find({gpus->first, RDC_FI_GPU_MEMORY_TOTAL}) == gpu_gauges.end()) { + RDC_LOG(RDC_ERROR, "Cannot find the total memory"); + return RDC_ST_BAD_PARAMETER; + } + uint64_t tmemory = gpu_gauges.at({gpus->first, RDC_FI_GPU_MEMORY_TOTAL}); + + auto ite = gpus->second.field_summaries.begin(); + for (; ite != gpus->second.field_summaries.end(); ite++) { + if (ite->first == RDC_FI_POWER_USAGE) { + set_summary(ite->second, gpu_info.power_usage, summary_info.power_usage, 1000000); + } else if (ite->first == RDC_FI_GPU_MEMORY_USAGE) { + set_summary(ite->second, gpu_info.memory_utilization, summary_info.memory_utilization, + tmemory / 100); + gpu_info.max_gpu_memory_used = ite->second.max_value; + summary_info.max_gpu_memory_used = + std::max(summary_info.max_gpu_memory_used, gpu_info.max_gpu_memory_used); + } else if (ite->first == RDC_FI_GPU_CLOCK) { + set_summary(ite->second, gpu_info.gpu_clock, summary_info.gpu_clock, 1000000); + } else if (ite->first == RDC_FI_GPU_UTIL) { + set_summary(ite->second, gpu_info.gpu_utilization, summary_info.gpu_utilization, 1); + } else if (ite->first == RDC_FI_GPU_TEMP) { + set_summary(ite->second, gpu_info.gpu_temperature, summary_info.gpu_temperature, 1000); + } else if (ite->first == RDC_FI_MEM_CLOCK) { + set_summary(ite->second, gpu_info.memory_clock, summary_info.memory_clock, 1000000); + } else if (ite->first == RDC_FI_PCIE_TX) { + set_summary(ite->second, gpu_info.pcie_tx, summary_info.pcie_tx, 1024 * 1024); + } else if (ite->first == RDC_FI_PCIE_RX) { + set_summary(ite->second, gpu_info.pcie_rx, summary_info.pcie_rx, 1024 * 1024); + } + } + } + // Set the average of the summary + set_average_summary(summary_info.power_usage, p_job_info->num_gpus); + set_average_summary(summary_info.gpu_clock, p_job_info->num_gpus); + set_average_summary(summary_info.gpu_utilization, p_job_info->num_gpus); + 
set_average_summary(summary_info.memory_utilization, p_job_info->num_gpus); + set_average_summary(summary_info.pcie_tx, p_job_info->num_gpus); + set_average_summary(summary_info.pcie_rx, p_job_info->num_gpus); + set_average_summary(summary_info.gpu_temperature, p_job_info->num_gpus); + set_average_summary(summary_info.memory_clock, p_job_info->num_gpus); + + return RDC_ST_OK; } -void RdcCacheManagerImpl::set_average_summary( - rdc_stats_summary_t& summary, uint32_t num_gpus) { - summary.average = summary.average/num_gpus; - summary.standard_deviation = summary.standard_deviation/num_gpus; +void RdcCacheManagerImpl::set_average_summary(rdc_stats_summary_t& summary, uint32_t num_gpus) { + summary.average = summary.average / num_gpus; + summary.standard_deviation = summary.standard_deviation / num_gpus; } rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(const char job_id[64], - const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo, - const rdc_gpu_gauges_t& gpu_gauges) { - RdcJobStatsCacheEntry cacheEntry; - cacheEntry.start_time = std::time(nullptr); - cacheEntry.end_time = 0; - for (unsigned int i=0 ; i < ginfo.count; i++) { // GPUs - GpuSummaryStats gstats; - gstats.energy_consumed = 0; - gstats.energy_last_time = 0; - for (unsigned int j = 0; j < finfo.count; j++) { // init fields - FieldSummaryStats s; - s.count = 0; - s.max_value = s.min_value = s.total_value = 0; - gstats.field_summaries.insert({finfo.field_ids[j], s}); - } + const rdc_group_info_t& ginfo, + const rdc_field_group_info_t& finfo, + const rdc_gpu_gauges_t& gpu_gauges) { + RdcJobStatsCacheEntry cacheEntry; + cacheEntry.start_time = std::time(nullptr); + cacheEntry.end_time = 0; + for (unsigned int i = 0; i < ginfo.count; i++) { // GPUs + GpuSummaryStats gstats; + gstats.energy_consumed = 0; + gstats.energy_last_time = 0; + for (unsigned int j = 0; j < finfo.count; j++) { // init fields + FieldSummaryStats s; + s.count = 0; + s.max_value = s.min_value = s.total_value = 0; + 
gstats.field_summaries.insert({finfo.field_ids[j], s}); + } - gstats.ecc_correct_init = 0; - if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL}) != - gpu_gauges.end()) { - gstats.ecc_correct_init = gpu_gauges.at( - {ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL}); - } + gstats.ecc_correct_init = 0; + if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) { + gstats.ecc_correct_init = gpu_gauges.at({ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL}); + } - gstats.ecc_uncorrect_init = 0; - if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL}) != - gpu_gauges.end()) { - gstats.ecc_uncorrect_init = gpu_gauges.at( - {ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL}); - } + gstats.ecc_uncorrect_init = 0; + if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) { + gstats.ecc_uncorrect_init = gpu_gauges.at({ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL}); + } - cacheEntry.gpu_stats.insert({ginfo.entity_ids[i], gstats}); - } + cacheEntry.gpu_stats.insert({ginfo.entity_ids[i], gstats}); + } - std::lock_guard guard(cache_mutex_); - // Remove the old stats if it exists - cache_jobs_.erase(job_id); - cache_jobs_.insert({job_id, cacheEntry}); - return RDC_ST_OK; + std::lock_guard guard(cache_mutex_); + // Remove the old stats if it exists + cache_jobs_.erase(job_id); + cache_jobs_.insert({job_id, cacheEntry}); + return RDC_ST_OK; } - rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(const char job_id[64], - const rdc_gpu_gauges_t& gpu_gauges) { - std::lock_guard guard(cache_mutex_); - auto job_stats = cache_jobs_.find(job_id); + const rdc_gpu_gauges_t& gpu_gauges) { + std::lock_guard guard(cache_mutex_); + auto job_stats = cache_jobs_.find(job_id); - if (job_stats == cache_jobs_.end()) { - return RDC_ST_NOT_FOUND; + if (job_stats == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } + + job_stats->second.end_time = std::time(nullptr); + + // update the ecc errors + auto 
gpus = job_stats->second.gpu_stats.begin(); + for (; gpus != job_stats->second.gpu_stats.end(); gpus++) { + if (gpu_gauges.find({gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) { + gpus->second.ecc_correct_init = + gpu_gauges.at({gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) - gpus->second.ecc_correct_init; } - job_stats->second.end_time = std::time(nullptr); - - // update the ecc errors - auto gpus = job_stats->second.gpu_stats.begin(); - for (; gpus != job_stats->second.gpu_stats.end(); gpus++) { - if (gpu_gauges.find({gpus->first, - RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) { - gpus->second.ecc_correct_init = gpu_gauges.at({ - gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) - - gpus->second.ecc_correct_init; - } - - if (gpu_gauges.find({gpus->first, - RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) { - gpus->second.ecc_uncorrect_init = gpu_gauges.at({ - gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) - - gpus->second.ecc_uncorrect_init; - } + if (gpu_gauges.find({gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) { + gpus->second.ecc_uncorrect_init = gpu_gauges.at({gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) - + gpus->second.ecc_uncorrect_init; } + } - return RDC_ST_OK; + return RDC_ST_OK; } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc b/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc index ef055a6c0e..c465c3abd9 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc @@ -20,132 +20,126 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "rdc_lib/impl/RdcDiagnosticModule.h" + +#include #include #include #include -#include + #include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcMetricFetcher.h" -#include "rdc_lib/impl/RdcSmiLib.h" #include "rdc_lib/impl/RdcRasLib.h" #include "rdc_lib/impl/RdcRocrLib.h" +#include "rdc_lib/impl/RdcSmiLib.h" namespace amd { namespace rdc { rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) { - if (test_case_count == nullptr) { - return RDC_ST_BAD_PARAMETER; + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], uint32_t* test_case_count) { + if (test_case_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + auto ite = diagnostic_modules_.begin(); + *test_case_count = 0; + for (; ite != diagnostic_modules_.end(); ite++) { + uint32_t count = 0; + rdc_status_t status = + (*ite)->rdc_diag_test_cases_query(&(test_cases[*test_case_count]), &count); + if (status == RDC_ST_OK) { + *test_case_count += count; } - auto ite = diagnostic_modules_.begin(); - *test_case_count = 0; - for (; ite != diagnostic_modules_.end(); ite++) { - uint32_t count = 0; - rdc_status_t status = (*ite)->rdc_diag_test_cases_query( - &(test_cases[*test_case_count]), &count); - if (status == RDC_ST_OK) { - *test_case_count += count; - } - } - return RDC_ST_OK; + } + return RDC_ST_OK; } -rdc_status_t RdcDiagnosticModule::rdc_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { - if (result == nullptr) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - // Init test status - auto ite = testcases_to_module_.find(test_case); - if (ite == testcases_to_module_.end()) { - 
result->status = RDC_DIAG_RESULT_SKIP; - strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH); - return RDC_ST_NOT_SUPPORTED; - } - return ite->second->rdc_test_case_run(test_case, - gpu_index, gpu_count, result); + // Init test status + auto ite = testcases_to_module_.find(test_case); + if (ite == testcases_to_module_.end()) { + result->status = RDC_DIAG_RESULT_SKIP; + strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH); + return RDC_ST_NOT_SUPPORTED; + } + return ite->second->rdc_test_case_run(test_case, gpu_index, gpu_count, result); } -rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run( - const rdc_group_info_t& gpus, - rdc_diag_level_t level, - rdc_diag_response_t* response) { - if (response == nullptr) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + if (response == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - std::vector rdc_runs; - if (level >= RDC_DIAG_LVL_SHORT) { // Short run and above - rdc_runs.push_back(RDC_DIAG_COMPUTE_PROCESS); - rdc_runs.push_back(RDC_DIAG_NODE_TOPOLOGY); - rdc_runs.push_back(RDC_DIAG_GPU_PARAMETERS); - rdc_runs.push_back(RDC_DIAG_COMPUTE_QUEUE); - rdc_runs.push_back(RDC_DIAG_SYS_MEM_CHECK); - } + std::vector rdc_runs; + if (level >= RDC_DIAG_LVL_SHORT) { // Short run and above + rdc_runs.push_back(RDC_DIAG_COMPUTE_PROCESS); + rdc_runs.push_back(RDC_DIAG_NODE_TOPOLOGY); + rdc_runs.push_back(RDC_DIAG_GPU_PARAMETERS); + rdc_runs.push_back(RDC_DIAG_COMPUTE_QUEUE); + rdc_runs.push_back(RDC_DIAG_SYS_MEM_CHECK); + } - response->results_count = 0; - for (unsigned int i=0; i < rdc_runs.size(); i++) { - response->diag_info[i].test_case = rdc_runs[i]; - rdc_test_case_run(rdc_runs[i], - const_cast(gpus.entity_ids), - gpus.count, &(response->diag_info[i])); - response->results_count++; - } + response->results_count = 0; + for (unsigned int i = 0; i < rdc_runs.size(); 
i++) { + response->diag_info[i].test_case = rdc_runs[i]; + rdc_test_case_run(rdc_runs[i], const_cast(gpus.entity_ids), gpus.count, + &(response->diag_info[i])); + response->results_count++; + } - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcDiagnosticModule::rdc_diag_init(uint64_t flag) { - auto ite = diagnostic_modules_.begin(); - for (; ite != diagnostic_modules_.end(); ite++) { - (*ite)->rdc_diag_init(flag); - } - return RDC_ST_OK; + auto ite = diagnostic_modules_.begin(); + for (; ite != diagnostic_modules_.end(); ite++) { + (*ite)->rdc_diag_init(flag); + } + return RDC_ST_OK; } rdc_status_t RdcDiagnosticModule::RdcDiagnosticModule::rdc_diag_destroy() { - auto ite = diagnostic_modules_.begin(); - for (; ite != diagnostic_modules_.end(); ite++) { - (*ite)->rdc_diag_destroy(); - } - return RDC_ST_OK; + auto ite = diagnostic_modules_.begin(); + for (; ite != diagnostic_modules_.end(); ite++) { + (*ite)->rdc_diag_destroy(); + } + return RDC_ST_OK; } RdcDiagnosticModule::RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher) { - const RdcSmiLibPtr smi_module = std::make_shared(fetcher); - const RdcRasLibPtr ras_module = std::make_shared(); - const RdcRocrLibPtr rocr_module = std::make_shared(); - if (smi_module) { - diagnostic_modules_.push_back(smi_module); - } - if (rocr_module) { - diagnostic_modules_.push_back(rocr_module); - } - if (ras_module) { - diagnostic_modules_.push_back(ras_module); - } + const RdcSmiLibPtr smi_module = std::make_shared(fetcher); + const RdcRasLibPtr ras_module = std::make_shared(); + const RdcRocrLibPtr rocr_module = std::make_shared(); + if (smi_module) { + diagnostic_modules_.push_back(smi_module); + } + if (rocr_module) { + diagnostic_modules_.push_back(rocr_module); + } + if (ras_module) { + diagnostic_modules_.push_back(ras_module); + } - auto ite = diagnostic_modules_.begin(); - for (; ite != diagnostic_modules_.end(); ite++) { - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES]; - uint32_t test_count = 0; - rdc_status_t 
status = (*ite)-> - rdc_diag_test_cases_query(test_cases, &test_count); - if (status == RDC_ST_OK) { - for (uint32_t index = 0; index < test_count; index++) { - testcases_to_module_.insert({test_cases[index], (*ite)}); - } - } + auto ite = diagnostic_modules_.begin(); + for (; ite != diagnostic_modules_.end(); ite++) { + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES]; + uint32_t test_count = 0; + rdc_status_t status = (*ite)->rdc_diag_test_cases_query(test_cases, &test_count); + if (status == RDC_ST_OK) { + for (uint32_t index = 0; index < test_count; index++) { + testcases_to_module_.insert({test_cases[index], (*ite)}); + } } + } } - } // namespace rdc } // namespace amd - diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index cabffe0f95..0239ec1b04 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -19,48 +19,50 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include #include "rdc_lib/impl/RdcEmbeddedHandler.h" -#include "rdc_lib/impl/RdcMetricFetcherImpl.h" -#include "rdc_lib/impl/RdcGroupSettingsImpl.h" -#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h" + +#include + +#include "common/rdc_fields_supported.h" +#include "rdc_lib/RdcException.h" +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/RdcNotification.h" #include "rdc_lib/impl/RdcCacheManagerImpl.h" -#include "rdc_lib/impl/RdcWatchTableImpl.h" +#include "rdc_lib/impl/RdcGroupSettingsImpl.h" +#include "rdc_lib/impl/RdcMetricFetcherImpl.h" +#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h" #include "rdc_lib/impl/RdcModuleMgrImpl.h" #include "rdc_lib/impl/RdcNotificationImpl.h" +#include "rdc_lib/impl/RdcWatchTableImpl.h" #include "rdc_lib/rdc_common.h" -#include "rdc_lib/RdcLogger.h" -#include "rdc_lib/RdcException.h" -#include "rdc_lib/RdcNotification.h" -#include "common/rdc_fields_supported.h" #include "rocm_smi/rocm_smi.h" namespace { // call the rsmi_init when load library // and rsmi_shutdown when unload the library. 
class rsmi_initializer { - rsmi_initializer() { - // Make sure rsmi will not be initialized multiple times - rsmi_shut_down(); - rsmi_status_t rsmi_ret = rsmi_init(0); - if (rsmi_ret != RSMI_STATUS_SUCCESS) { - throw amd::rdc::RdcException( - RDC_ST_FAIL_LOAD_MODULE, "RSMI initialize fail"); - } - } - ~rsmi_initializer() { rsmi_shut_down();} + rsmi_initializer() { + // Make sure rsmi will not be initialized multiple times + rsmi_shut_down(); + rsmi_status_t rsmi_ret = rsmi_init(0); + if (rsmi_ret != RSMI_STATUS_SUCCESS) { + throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "RSMI initialize fail"); + } + } + ~rsmi_initializer() { rsmi_shut_down(); } + public: - static rsmi_initializer& getInstance() { - static rsmi_initializer instance; - return instance; - } + static rsmi_initializer& getInstance() { + static rsmi_initializer instance; + return instance; + } }; static rsmi_initializer& in = rsmi_initializer::getInstance(); } // namespace -amd::rdc::RdcHandler *make_handler(rdc_operation_mode_t op_mode) { - return new amd::rdc::RdcEmbeddedHandler(op_mode); +amd::rdc::RdcHandler* make_handler(rdc_operation_mode_t op_mode) { + return new amd::rdc::RdcEmbeddedHandler(op_mode); } namespace amd { @@ -69,368 +71,326 @@ namespace rdc { // TODO(bill_liu): make it configurable const uint32_t METIC_UPDATE_FREQUENCY = 1000; // 1000 microseconds by default -RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode): - group_settings_(new RdcGroupSettingsImpl()) - , cache_mgr_(new RdcCacheManagerImpl()) - , metric_fetcher_(new RdcMetricFetcherImpl()) - , rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)) - , rdc_notif_(new RdcNotificationImpl()) - , watch_table_(new RdcWatchTableImpl(group_settings_, - cache_mgr_, rdc_module_mgr_, rdc_notif_)) - , metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, - METIC_UPDATE_FREQUENCY)) { - if (mode == RDC_OPERATION_MODE_AUTO) { - RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO"); - metrics_updater_->start(); - 
} +RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode) + : group_settings_(new RdcGroupSettingsImpl()), + cache_mgr_(new RdcCacheManagerImpl()), + metric_fetcher_(new RdcMetricFetcherImpl()), + rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)), + rdc_notif_(new RdcNotificationImpl()), + watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, rdc_module_mgr_, rdc_notif_)), + metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) { + if (mode == RDC_OPERATION_MODE_AUTO) { + RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO"); + metrics_updater_->start(); + } } -RdcEmbeddedHandler::~RdcEmbeddedHandler() { - metrics_updater_->stop(); -} +RdcEmbeddedHandler::~RdcEmbeddedHandler() { metrics_updater_->stop(); } // JOB API -rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, - const char job_id[64], uint64_t update_freq) { - rdc_gpu_gauges_t gpu_gauges; - rdc_status_t status = get_gpu_gauges(&gpu_gauges); - if (status != RDC_ST_OK) return status; +rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, const char job_id[64], + uint64_t update_freq) { + rdc_gpu_gauges_t gpu_gauges; + rdc_status_t status = get_gpu_gauges(&gpu_gauges); + if (status != RDC_ST_OK) return status; - return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq, - gpu_gauges); + return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq, gpu_gauges); } rdc_status_t RdcEmbeddedHandler::get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges) { - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; - uint32_t count = 0; + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + uint32_t count = 0; - if (gpu_gauges == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - rdc_status_t status = rdc_device_get_all( - gpu_index_list, &count); + if (gpu_gauges == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + rdc_status_t status = rdc_device_get_all(gpu_index_list, &count); + if (status != RDC_ST_OK) { + return 
status; + } + + // Fetch total memory and current ecc errors + for (uint32_t i = 0; i < count; i++) { + rdc_field_value value; + status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], RDC_FI_GPU_MEMORY_TOTAL, &value); if (status != RDC_ST_OK) { - return status; + RDC_LOG(RDC_ERROR, "Fail to get total memory of GPU " << gpu_index_list[i]); + return status; } + gpu_gauges->insert({{gpu_index_list[i], RDC_FI_GPU_MEMORY_TOTAL}, value.value.l_int}); - // Fetch total memory and current ecc errors - for (uint32_t i = 0; i < count ; i++) { - rdc_field_value value; - status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], - RDC_FI_GPU_MEMORY_TOTAL, &value); - if (status != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Fail to get total memory of GPU " - << gpu_index_list[i]); - return status; - } - gpu_gauges->insert({{gpu_index_list[i], RDC_FI_GPU_MEMORY_TOTAL}, - value.value.l_int}); - - status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], - RDC_FI_ECC_CORRECT_TOTAL, &value); - if (status == RDC_ST_OK) { - gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_CORRECT_TOTAL}, - value.value.l_int}); - } - status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], - RDC_FI_ECC_UNCORRECT_TOTAL, &value); - if (status == RDC_ST_OK) { - gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_UNCORRECT_TOTAL}, - value.value.l_int}); - } + status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], RDC_FI_ECC_CORRECT_TOTAL, &value); + if (status == RDC_ST_OK) { + gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_CORRECT_TOTAL}, value.value.l_int}); } - return RDC_ST_OK; + status = + metric_fetcher_->fetch_smi_field(gpu_index_list[i], RDC_FI_ECC_UNCORRECT_TOTAL, &value); + if (status == RDC_ST_OK) { + gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_UNCORRECT_TOTAL}, value.value.l_int}); + } + } + return RDC_ST_OK; } rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(const char job_id[64], - rdc_job_info_t* p_job_info) { - if (p_job_info == nullptr) { - return 
RDC_ST_BAD_PARAMETER; - } + rdc_job_info_t* p_job_info) { + if (p_job_info == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - rdc_gpu_gauges_t gpu_gauges; - rdc_status_t status = get_gpu_gauges(&gpu_gauges); - if (status != RDC_ST_OK) return status; + rdc_gpu_gauges_t gpu_gauges; + rdc_status_t status = get_gpu_gauges(&gpu_gauges); + if (status != RDC_ST_OK) return status; - return cache_mgr_->rdc_job_get_stats(job_id, gpu_gauges, p_job_info); + return cache_mgr_->rdc_job_get_stats(job_id, gpu_gauges, p_job_info); } rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(const char job_id[64]) { - rdc_gpu_gauges_t gpu_gauges; - rdc_status_t status = get_gpu_gauges(&gpu_gauges); - if (status != RDC_ST_OK) return status; + rdc_gpu_gauges_t gpu_gauges; + rdc_status_t status = get_gpu_gauges(&gpu_gauges); + if (status != RDC_ST_OK) return status; - return watch_table_->rdc_job_stop_stats(job_id, gpu_gauges); + return watch_table_->rdc_job_stop_stats(job_id, gpu_gauges); } rdc_status_t RdcEmbeddedHandler::rdc_job_remove(const char job_id[64]) { - return watch_table_->rdc_job_remove(job_id); + return watch_table_->rdc_job_remove(job_id); } - -rdc_status_t RdcEmbeddedHandler::rdc_job_remove_all() { - return watch_table_->rdc_job_remove_all(); -} +rdc_status_t RdcEmbeddedHandler::rdc_job_remove_all() { return watch_table_->rdc_job_remove_all(); } // Discovery API -rdc_status_t RdcEmbeddedHandler::rdc_device_get_all( - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { - if (!count) { - return RDC_ST_BAD_PARAMETER; - } - rdc_field_value device_count; - rdc_status_t status = metric_fetcher_-> - fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count); - if (status != RDC_ST_OK) { - return status; - } +rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], + uint32_t* count) { + if (!count) { + return RDC_ST_BAD_PARAMETER; + } + rdc_field_value device_count; + rdc_status_t status = metric_fetcher_->fetch_smi_field(0, 
RDC_FI_GPU_COUNT, &device_count); + if (status != RDC_ST_OK) { + return status; + } - // Assign the index to the index list - *count = device_count.value.l_int; - for (uint32_t i=0; i < *count; i++) { - gpu_index_list[i] = i; - } + // Assign the index to the index list + *count = device_count.value.l_int; + for (uint32_t i = 0; i < *count; i++) { + gpu_index_list[i] = i; + } - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index, - rdc_device_attributes_t* p_rdc_attr) { - if (!p_rdc_attr) { - return RDC_ST_BAD_PARAMETER; - } - rdc_field_value device_name; - rdc_status_t status = metric_fetcher_-> - fetch_smi_field(gpu_index, RDC_FI_DEV_NAME, &device_name); - strncpy_with_null(p_rdc_attr->device_name, device_name.value.str, - RDC_MAX_STR_LENGTH); - return status; + rdc_device_attributes_t* p_rdc_attr) { + if (!p_rdc_attr) { + return RDC_ST_BAD_PARAMETER; + } + rdc_field_value device_name; + rdc_status_t status = metric_fetcher_->fetch_smi_field(gpu_index, RDC_FI_DEV_NAME, &device_name); + strncpy_with_null(p_rdc_attr->device_name, device_name.value.str, RDC_MAX_STR_LENGTH); + return status; } - // Group API -rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_create(rdc_group_type_t type, - const char* group_name, - rdc_gpu_group_t* p_rdc_group_id) { - if (!group_name || !p_rdc_group_id) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_create(rdc_group_type_t type, const char* group_name, + rdc_gpu_group_t* p_rdc_group_id) { + if (!group_name || !p_rdc_group_id) { + return RDC_ST_BAD_PARAMETER; + } - rdc_status_t status = group_settings_-> - rdc_group_gpu_create(group_name, p_rdc_group_id); - - if (status != RDC_ST_OK || type == RDC_GROUP_EMPTY) { - return status; - } - - // Add All GPUs to the group - uint32_t count = 0; - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; - status = rdc_device_get_all( - gpu_index_list, &count); - if (status != RDC_ST_OK) { - return status; 
- } - for (uint32_t i=0; i < count; i++) { - status = rdc_group_gpu_add(*p_rdc_group_id, gpu_index_list[i]); - } + rdc_status_t status = group_settings_->rdc_group_gpu_create(group_name, p_rdc_group_id); + if (status != RDC_ST_OK || type == RDC_GROUP_EMPTY) { return status; + } + + // Add All GPUs to the group + uint32_t count = 0; + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + status = rdc_device_get_all(gpu_index_list, &count); + if (status != RDC_ST_OK) { + return status; + } + for (uint32_t i = 0; i < count; i++) { + status = rdc_group_gpu_add(*p_rdc_group_id, gpu_index_list[i]); + } + + return status; } -rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, - uint32_t gpu_index) { - uint32_t count = 0; - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; - rdc_status_t status = rdc_device_get_all( - gpu_index_list, &count); - if (status != RDC_ST_OK) { - return status; - } - bool is_gpu_exist = false; - for (uint32_t i=0; i < count; i++) { - if (gpu_index_list[i] == gpu_index) { - is_gpu_exist = true; - break; - } +rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, uint32_t gpu_index) { + uint32_t count = 0; + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + rdc_status_t status = rdc_device_get_all(gpu_index_list, &count); + if (status != RDC_ST_OK) { + return status; + } + bool is_gpu_exist = false; + for (uint32_t i = 0; i < count; i++) { + if (gpu_index_list[i] == gpu_index) { + is_gpu_exist = true; + break; } + } - if (!is_gpu_exist) { - RDC_LOG(RDC_INFO, "Fail to add GPU index " << gpu_index << " to group " - << group_id <<" as the GPU index is invalid."); - return RDC_ST_NOT_FOUND; - } + if (!is_gpu_exist) { + RDC_LOG(RDC_INFO, "Fail to add GPU index " << gpu_index << " to group " << group_id + << " as the GPU index is invalid."); + return RDC_ST_NOT_FOUND; + } - return group_settings_->rdc_group_gpu_add(group_id, gpu_index); + return group_settings_->rdc_group_gpu_add(group_id, gpu_index); } rdc_status_t 
RdcEmbeddedHandler::rdc_group_field_create(uint32_t num_field_ids, - rdc_field_t* field_ids, const char* field_group_name, - rdc_field_grp_t* rdc_field_group_id) { - if (!field_group_name || !rdc_field_group_id || !field_ids) { - return RDC_ST_BAD_PARAMETER; - } + rdc_field_t* field_ids, + const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) { + if (!field_group_name || !rdc_field_group_id || !field_ids) { + return RDC_ST_BAD_PARAMETER; + } - // Check the field is valid or not - if (num_field_ids <= RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) { - for (uint32_t i = 0; i < num_field_ids; i++) { - if (!is_field_valid(field_ids[i])) { - RDC_LOG(RDC_INFO, - "Fail to create field group with unknown field id " - << field_ids[i]); - return RDC_ST_NOT_SUPPORTED; - } - } - } else { - return RDC_ST_MAX_LIMIT; + // Check the field is valid or not + if (num_field_ids <= RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) { + for (uint32_t i = 0; i < num_field_ids; i++) { + if (!is_field_valid(field_ids[i])) { + RDC_LOG(RDC_INFO, "Fail to create field group with unknown field id " << field_ids[i]); + return RDC_ST_NOT_SUPPORTED; + } } + } else { + return RDC_ST_MAX_LIMIT; + } - return group_settings_->rdc_group_field_create( - num_field_ids, field_ids, field_group_name, rdc_field_group_id); + return group_settings_->rdc_group_field_create(num_field_ids, field_ids, field_group_name, + rdc_field_group_id); } rdc_status_t RdcEmbeddedHandler::rdc_group_field_get_info( - rdc_field_grp_t rdc_field_group_id, - rdc_field_group_info_t* field_group_info) { - if (!field_group_info) { - return RDC_ST_BAD_PARAMETER; - } - return group_settings_->rdc_group_field_get_info( - rdc_field_group_id, field_group_info); + rdc_field_grp_t rdc_field_group_id, rdc_field_group_info_t* field_group_info) { + if (!field_group_info) { + return RDC_ST_BAD_PARAMETER; + } + return group_settings_->rdc_group_field_get_info(rdc_field_group_id, field_group_info); } -rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_get_info( - 
rdc_gpu_group_t p_rdc_group_id, - rdc_group_info_t* p_rdc_group_info) { - if (!p_rdc_group_info) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) { + if (!p_rdc_group_info) { + return RDC_ST_BAD_PARAMETER; + } - return group_settings_->rdc_group_gpu_get_info( - p_rdc_group_id, p_rdc_group_info); + return group_settings_->rdc_group_gpu_get_info(p_rdc_group_id, p_rdc_group_info); } -rdc_status_t RdcEmbeddedHandler::rdc_group_get_all_ids( - rdc_gpu_group_t group_id_list[], uint32_t* count) { - if (!count) { - return RDC_ST_BAD_PARAMETER; - } - return group_settings_->rdc_group_get_all_ids(group_id_list, count); +rdc_status_t RdcEmbeddedHandler::rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], + uint32_t* count) { + if (!count) { + return RDC_ST_BAD_PARAMETER; + } + return group_settings_->rdc_group_get_all_ids(group_id_list, count); } -rdc_status_t RdcEmbeddedHandler::rdc_group_field_get_all_ids( - rdc_field_grp_t field_group_id_list[], uint32_t* count) { - if (!count) { - return RDC_ST_BAD_PARAMETER; - } - return group_settings_->rdc_group_field_get_all_ids( - field_group_id_list, count); +rdc_status_t RdcEmbeddedHandler::rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[], + uint32_t* count) { + if (!count) { + return RDC_ST_BAD_PARAMETER; + } + return group_settings_->rdc_group_field_get_all_ids(field_group_id_list, count); } - -rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_destroy( - rdc_gpu_group_t p_rdc_group_id) { - return group_settings_->rdc_group_gpu_destroy(p_rdc_group_id); +rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) { + return group_settings_->rdc_group_gpu_destroy(p_rdc_group_id); } -rdc_status_t RdcEmbeddedHandler::rdc_group_field_destroy( - rdc_field_grp_t rdc_field_group_id) { - return group_settings_->rdc_group_field_destroy(rdc_field_group_id); +rdc_status_t 
RdcEmbeddedHandler::rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) { + return group_settings_->rdc_group_field_destroy(rdc_field_group_id); } // Field API rdc_status_t RdcEmbeddedHandler::rdc_field_watch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples) { - return watch_table_->rdc_field_watch(group_id, field_group_id, - update_freq, max_keep_age, max_keep_samples); + rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) { + return watch_table_->rdc_field_watch(group_id, field_group_id, update_freq, max_keep_age, + max_keep_samples); } -rdc_status_t RdcEmbeddedHandler::rdc_field_get_latest_value( - uint32_t gpu_index, rdc_field_t field, rdc_field_value* value) { - if (!value) { - return RDC_ST_BAD_PARAMETER; - } - if (!is_field_valid(field)) { - RDC_LOG(RDC_INFO, - "Fail to get latest value with unknown field id " - << field); - return RDC_ST_NOT_SUPPORTED; - } - return cache_mgr_->rdc_field_get_latest_value(gpu_index, field, value); +rdc_status_t RdcEmbeddedHandler::rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field, + rdc_field_value* value) { + if (!value) { + return RDC_ST_BAD_PARAMETER; + } + if (!is_field_valid(field)) { + RDC_LOG(RDC_INFO, "Fail to get latest value with unknown field id " << field); + return RDC_ST_NOT_SUPPORTED; + } + return cache_mgr_->rdc_field_get_latest_value(gpu_index, field, value); } -rdc_status_t RdcEmbeddedHandler::rdc_field_get_value_since(uint32_t gpu_index, - rdc_field_t field, uint64_t since_time_stamp, - uint64_t *next_since_time_stamp, rdc_field_value* value) { - if (!next_since_time_stamp || !value) { - return RDC_ST_BAD_PARAMETER; - } - if (!is_field_valid(field)) { - RDC_LOG(RDC_INFO, - "Fail to get value since with unknown field id " - << field); - return RDC_ST_NOT_SUPPORTED; - } - return cache_mgr_->rdc_field_get_value_since(gpu_index, field, - since_time_stamp, 
next_since_time_stamp, value); +rdc_status_t RdcEmbeddedHandler::rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field, + uint64_t since_time_stamp, + uint64_t* next_since_time_stamp, + rdc_field_value* value) { + if (!next_since_time_stamp || !value) { + return RDC_ST_BAD_PARAMETER; + } + if (!is_field_valid(field)) { + RDC_LOG(RDC_INFO, "Fail to get value since with unknown field id " << field); + return RDC_ST_NOT_SUPPORTED; + } + return cache_mgr_->rdc_field_get_value_since(gpu_index, field, since_time_stamp, + next_since_time_stamp, value); } rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id) { - return watch_table_->rdc_field_unwatch(group_id, field_group_id); + rdc_field_grp_t field_group_id) { + return watch_table_->rdc_field_unwatch(group_id, field_group_id); } // Diagnostic API -rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run( - rdc_gpu_group_t group_id, - rdc_diag_level_t level, - rdc_diag_response_t* response) { - if (!response) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + if (!response) { + return RDC_ST_BAD_PARAMETER; + } - // Get GPU group information - rdc_group_info_t rdc_group_info; - rdc_status_t status = rdc_group_gpu_get_info( - group_id, &rdc_group_info); - if (status != RDC_ST_OK) return status; + // Get GPU group information + rdc_group_info_t rdc_group_info; + rdc_status_t status = rdc_group_gpu_get_info(group_id, &rdc_group_info); + if (status != RDC_ST_OK) return status; - auto diag = rdc_module_mgr_->get_diagnostic_module(); - return diag->rdc_diagnostic_run(rdc_group_info, level, response); + auto diag = rdc_module_mgr_->get_diagnostic_module(); + return diag->rdc_diagnostic_run(rdc_group_info, level, response); } -rdc_status_t RdcEmbeddedHandler::rdc_test_case_run( - rdc_gpu_group_t group_id, - rdc_diag_test_cases_t test_case, - 
rdc_diag_test_result_t* result) { - if (!result) { - return RDC_ST_BAD_PARAMETER; - } - // Get GPU group information - rdc_group_info_t rdc_group_info; - rdc_status_t status = rdc_group_gpu_get_info( - group_id, &rdc_group_info); - if (status != RDC_ST_OK) return status; +rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result) { + if (!result) { + return RDC_ST_BAD_PARAMETER; + } + // Get GPU group information + rdc_group_info_t rdc_group_info; + rdc_status_t status = rdc_group_gpu_get_info(group_id, &rdc_group_info); + if (status != RDC_ST_OK) return status; - auto diag = rdc_module_mgr_->get_diagnostic_module(); - return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, - rdc_group_info.count, result); + auto diag = rdc_module_mgr_->get_diagnostic_module(); + return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, rdc_group_info.count, + result); } // Control API -rdc_status_t RdcEmbeddedHandler::rdc_field_update_all( - uint32_t wait_for_update) { - if (wait_for_update == 1) { - return watch_table_->rdc_field_update_all(); - } +rdc_status_t RdcEmbeddedHandler::rdc_field_update_all(uint32_t wait_for_update) { + if (wait_for_update == 1) { + return watch_table_->rdc_field_update_all(); + } - // Async update the field and return immediately. - updater_ = std::async(std::launch::async, [this](){ - watch_table_->rdc_field_update_all(); - }); + // Async update the field and return immediately. 
+ updater_ = std::async(std::launch::async, [this]() { watch_table_->rdc_field_update_all(); }); - return RDC_ST_OK; + return RDC_ST_OK; } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc index 8b2fb102a4..a086cedb6f 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc @@ -20,201 +20,190 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "rdc_lib/impl/RdcGroupSettingsImpl.h" + #include -#include "rdc_lib/rdc_common.h" + #include "rdc_lib/RdcLogger.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { RdcGroupSettingsImpl::RdcGroupSettingsImpl() { - // Add the default job stats fields - rdc_field_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, - RDC_FI_POWER_USAGE, RDC_FI_GPU_CLOCK, RDC_FI_GPU_UTIL, - RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, RDC_FI_MEM_CLOCK, - RDC_FI_GPU_TEMP}; - char job_field_group[] = "JobStatsFields"; - rdc_field_grp_t fgid = JOB_FIELD_ID; + // Add the default job stats fields + rdc_field_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_POWER_USAGE, RDC_FI_GPU_CLOCK, + RDC_FI_GPU_UTIL, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, + RDC_FI_MEM_CLOCK, RDC_FI_GPU_TEMP}; + char job_field_group[] = "JobStatsFields"; + rdc_field_grp_t fgid = JOB_FIELD_ID; - rdc_group_field_create(sizeof(job_fields)/sizeof(uint32_t), - job_fields, job_field_group, &fgid); + rdc_group_field_create(sizeof(job_fields) / sizeof(uint32_t), job_fields, job_field_group, &fgid); } -rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create( - const char* group_name, rdc_gpu_group_t* p_rdc_group_id) { - RDC_LOG(RDC_DEBUG, "Create group " << group_name); - rdc_group_info_t ginfo; - strncpy_with_null(ginfo.group_name, group_name, RDC_MAX_STR_LENGTH); - ginfo.count = 0; +rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create(const char* group_name, + rdc_gpu_group_t* 
p_rdc_group_id) { + RDC_LOG(RDC_DEBUG, "Create group " << group_name); + rdc_group_info_t ginfo; + strncpy_with_null(ginfo.group_name, group_name, RDC_MAX_STR_LENGTH); + ginfo.count = 0; - std::lock_guard guard(group_mutex_); - if (gpu_group_.size() >= RDC_MAX_NUM_GROUPS) { - return RDC_ST_MAX_LIMIT; - } - gpu_group_.emplace(cur_group_id_, ginfo); - *p_rdc_group_id = cur_group_id_; - cur_group_id_++; + std::lock_guard guard(group_mutex_); + if (gpu_group_.size() >= RDC_MAX_NUM_GROUPS) { + return RDC_ST_MAX_LIMIT; + } + gpu_group_.emplace(cur_group_id_, ginfo); + *p_rdc_group_id = cur_group_id_; + cur_group_id_++; - return RDC_ST_OK; + return RDC_ST_OK; } - -rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_destroy( - rdc_gpu_group_t p_rdc_group_id) { - std::lock_guard guard(group_mutex_); - if (!gpu_group_.erase(p_rdc_group_id)) - return RDC_ST_NOT_FOUND; - return RDC_ST_OK; +rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) { + std::lock_guard guard(group_mutex_); + if (!gpu_group_.erase(p_rdc_group_id)) return RDC_ST_NOT_FOUND; + return RDC_ST_OK; } -rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add( - rdc_gpu_group_t groupId, uint32_t gpu_index ) { - std::lock_guard guard(group_mutex_); - auto ite = gpu_group_.find(groupId); - if (ite != gpu_group_.end()) { - // Check whether the index already exists - for (uint32_t i=0; i < ite->second.count; i++) { - if (ite->second.entity_ids[i] == gpu_index) { - RDC_LOG(RDC_INFO, "Fail to add " << gpu_index - <<" to GPU group " << groupId << " as it is already exists"); - return RDC_ST_BAD_PARAMETER; - } - } - if (ite->second.count < RDC_GROUP_MAX_ENTITIES) { - ite->second.entity_ids[ite->second.count] = gpu_index; - ite->second.count++; - } else { - return RDC_ST_MAX_LIMIT; - } - } else { - return RDC_ST_NOT_FOUND; - } - - return RDC_ST_OK; -} - -rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_get_info( - rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) { - 
std::lock_guard guard(group_mutex_); - auto ite = gpu_group_.find(p_rdc_group_id); - if (ite != gpu_group_.end()) { - auto info = ite->second; - strncpy_with_null(p_rdc_group_info->group_name, - info.group_name, RDC_MAX_STR_LENGTH); - p_rdc_group_info->count = info.count; - for (uint32_t i=0 ; i < info.count; i++) { - p_rdc_group_info->entity_ids[i]= info.entity_ids[i]; - } - } else { - return RDC_ST_NOT_FOUND; - } - - return RDC_ST_OK; -} - -rdc_status_t RdcGroupSettingsImpl::rdc_group_get_all_ids( - rdc_gpu_group_t group_id_list[], uint32_t* count) { - if (!count) { +rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) { + std::lock_guard guard(group_mutex_); + auto ite = gpu_group_.find(groupId); + if (ite != gpu_group_.end()) { + // Check whether the index already exists + for (uint32_t i = 0; i < ite->second.count; i++) { + if (ite->second.entity_ids[i] == gpu_index) { + RDC_LOG(RDC_INFO, "Fail to add " << gpu_index << " to GPU group " << groupId + << " as it is already exists"); return RDC_ST_BAD_PARAMETER; + } } - - *count = 0; - std::lock_guard guard(group_mutex_); - auto ite = gpu_group_.begin(); - for (; ite != gpu_group_.end(); ite++) { - if (*count >= RDC_MAX_NUM_GROUPS) { - return RDC_ST_MAX_LIMIT; - } - group_id_list[*count] = ite->first; - (*count)++; - } - - return RDC_ST_OK; -} - -rdc_status_t RdcGroupSettingsImpl::rdc_group_field_create( - uint32_t num_field_ids, rdc_field_t* field_ids, - const char* field_group_name, rdc_field_grp_t* rdc_field_group_id) { - - RDC_LOG(RDC_DEBUG, "Create field group " << field_group_name); - rdc_field_group_info_t finfo; - finfo.count = num_field_ids; - strncpy_with_null(finfo.group_name, field_group_name, RDC_MAX_STR_LENGTH); - if (num_field_ids <= RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) { - for (uint32_t i = 0; i < num_field_ids; i++) { - finfo.field_ids[i] = field_ids[i]; - } + if (ite->second.count < RDC_GROUP_MAX_ENTITIES) { + ite->second.entity_ids[ite->second.count] = 
gpu_index; + ite->second.count++; } else { - return RDC_ST_MAX_LIMIT; + return RDC_ST_MAX_LIMIT; } + } else { + return RDC_ST_NOT_FOUND; + } - std::lock_guard guard(field_group_mutex_); - if (field_group_.size() >= RDC_MAX_NUM_FIELD_GROUPS) { - return RDC_ST_MAX_LIMIT; - } - field_group_.emplace(cur_field_group_id_, finfo); - *rdc_field_group_id = cur_field_group_id_; - cur_field_group_id_++; - - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcGroupSettingsImpl::rdc_group_field_destroy( - rdc_field_grp_t rdc_field_group_id) { - if (rdc_field_group_id == JOB_FIELD_ID) { - RDC_LOG(RDC_INFO, "Cannot delete system JOB_FIELD_ID field group"); - return RDC_ST_BAD_PARAMETER; +rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) { + std::lock_guard guard(group_mutex_); + auto ite = gpu_group_.find(p_rdc_group_id); + if (ite != gpu_group_.end()) { + auto info = ite->second; + strncpy_with_null(p_rdc_group_info->group_name, info.group_name, RDC_MAX_STR_LENGTH); + p_rdc_group_info->count = info.count; + for (uint32_t i = 0; i < info.count; i++) { + p_rdc_group_info->entity_ids[i] = info.entity_ids[i]; } - std::lock_guard guard(field_group_mutex_); - if (!field_group_.erase(rdc_field_group_id)) - return RDC_ST_NOT_FOUND; - return RDC_ST_OK; + } else { + return RDC_ST_NOT_FOUND; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcGroupSettingsImpl::rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], + uint32_t* count) { + if (!count) { + return RDC_ST_BAD_PARAMETER; + } + + *count = 0; + std::lock_guard guard(group_mutex_); + auto ite = gpu_group_.begin(); + for (; ite != gpu_group_.end(); ite++) { + if (*count >= RDC_MAX_NUM_GROUPS) { + return RDC_ST_MAX_LIMIT; + } + group_id_list[*count] = ite->first; + (*count)++; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcGroupSettingsImpl::rdc_group_field_create(uint32_t num_field_ids, + rdc_field_t* field_ids, + const char* field_group_name, + 
rdc_field_grp_t* rdc_field_group_id) { + RDC_LOG(RDC_DEBUG, "Create field group " << field_group_name); + rdc_field_group_info_t finfo; + finfo.count = num_field_ids; + strncpy_with_null(finfo.group_name, field_group_name, RDC_MAX_STR_LENGTH); + if (num_field_ids <= RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) { + for (uint32_t i = 0; i < num_field_ids; i++) { + finfo.field_ids[i] = field_ids[i]; + } + } else { + return RDC_ST_MAX_LIMIT; + } + + std::lock_guard guard(field_group_mutex_); + if (field_group_.size() >= RDC_MAX_NUM_FIELD_GROUPS) { + return RDC_ST_MAX_LIMIT; + } + field_group_.emplace(cur_field_group_id_, finfo); + *rdc_field_group_id = cur_field_group_id_; + cur_field_group_id_++; + + return RDC_ST_OK; +} + +rdc_status_t RdcGroupSettingsImpl::rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) { + if (rdc_field_group_id == JOB_FIELD_ID) { + RDC_LOG(RDC_INFO, "Cannot delete system JOB_FIELD_ID field group"); + return RDC_ST_BAD_PARAMETER; + } + std::lock_guard guard(field_group_mutex_); + if (!field_group_.erase(rdc_field_group_id)) return RDC_ST_NOT_FOUND; + return RDC_ST_OK; } rdc_status_t RdcGroupSettingsImpl::rdc_group_field_get_info( - rdc_field_grp_t rdc_field_group_id, - rdc_field_group_info_t* field_group_info) { - - std::lock_guard guard(field_group_mutex_); - auto ite = field_group_.find(rdc_field_group_id); - if (ite != field_group_.end()) { - auto info = ite->second; - strncpy_with_null(field_group_info->group_name, info.group_name, - RDC_MAX_STR_LENGTH); - field_group_info->count = info.count; - for (uint32_t i=0 ; i < info.count; i++) { - field_group_info->field_ids[i]= info.field_ids[i]; - } - } else { - return RDC_ST_NOT_FOUND; + rdc_field_grp_t rdc_field_group_id, rdc_field_group_info_t* field_group_info) { + std::lock_guard guard(field_group_mutex_); + auto ite = field_group_.find(rdc_field_group_id); + if (ite != field_group_.end()) { + auto info = ite->second; + strncpy_with_null(field_group_info->group_name, info.group_name, 
RDC_MAX_STR_LENGTH); + field_group_info->count = info.count; + for (uint32_t i = 0; i < info.count; i++) { + field_group_info->field_ids[i] = info.field_ids[i]; } - return RDC_ST_OK; + } else { + return RDC_ST_NOT_FOUND; + } + return RDC_ST_OK; } rdc_status_t RdcGroupSettingsImpl::rdc_group_field_get_all_ids( - rdc_field_grp_t field_group_id_list[], uint32_t* count) { - if (!count) { - return RDC_ST_BAD_PARAMETER; + rdc_field_grp_t field_group_id_list[], uint32_t* count) { + if (!count) { + return RDC_ST_BAD_PARAMETER; + } + + *count = 0; + std::lock_guard guard(field_group_mutex_); + auto ite = field_group_.begin(); + for (; ite != field_group_.end(); ite++) { + if (*count >= RDC_MAX_NUM_FIELD_GROUPS) { + return RDC_ST_MAX_LIMIT; } - *count = 0; - std::lock_guard guard(field_group_mutex_); - auto ite = field_group_.begin(); - for (; ite != field_group_.end(); ite++) { - if (*count >= RDC_MAX_NUM_FIELD_GROUPS) { - return RDC_ST_MAX_LIMIT; - } + // Skip system defined JOB_FIELD_ID + if (ite->first == JOB_FIELD_ID) continue; - // Skip system defined JOB_FIELD_ID - if (ite->first == JOB_FIELD_ID) continue; + field_group_id_list[*count] = ite->first; + (*count)++; + } - field_group_id_list[*count] = ite->first; - (*count)++; - } - - return RDC_ST_OK; + return RDC_ST_OK; } - } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 12bd53935e..fbbccdd45a 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -20,34 +20,35 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "rdc_lib/impl/RdcMetricFetcherImpl.h" -#include -#include -#include -#include //NOLINT +#include +#include +#include + #include -#include +#include //NOLINT #include -#include "rdc_lib/rdc_common.h" +#include + +#include "common/rdc_capabilities.h" #include "common/rdc_fields_supported.h" #include "rdc_lib/RdcLogger.h" -#include "rocm_smi/rocm_smi.h" #include "rdc_lib/impl/RsmiUtils.h" -#include "common/rdc_capabilities.h" +#include "rdc_lib/rdc_common.h" +#include "rocm_smi/rocm_smi.h" namespace amd { namespace rdc { -static const std::unordered_map - rdc_evnt_2_rsmi_field = { - {RDC_EVNT_XGMI_0_NOP_TX, RSMI_EVNT_XGMI_0_NOP_TX}, - {RDC_EVNT_XGMI_0_REQ_TX, RSMI_EVNT_XGMI_0_REQUEST_TX}, +static const std::unordered_map rdc_evnt_2_rsmi_field = { + {RDC_EVNT_XGMI_0_NOP_TX, RSMI_EVNT_XGMI_0_NOP_TX}, + {RDC_EVNT_XGMI_0_REQ_TX, RSMI_EVNT_XGMI_0_REQUEST_TX}, {RDC_EVNT_XGMI_0_RESP_TX, RSMI_EVNT_XGMI_0_RESPONSE_TX}, - {RDC_EVNT_XGMI_0_BEATS_TX, RSMI_EVNT_XGMI_0_BEATS_TX}, - {RDC_EVNT_XGMI_1_NOP_TX, RSMI_EVNT_XGMI_1_NOP_TX}, - {RDC_EVNT_XGMI_1_REQ_TX, RSMI_EVNT_XGMI_1_REQUEST_TX}, + {RDC_EVNT_XGMI_0_BEATS_TX, RSMI_EVNT_XGMI_0_BEATS_TX}, + {RDC_EVNT_XGMI_1_NOP_TX, RSMI_EVNT_XGMI_1_NOP_TX}, + {RDC_EVNT_XGMI_1_REQ_TX, RSMI_EVNT_XGMI_1_REQUEST_TX}, {RDC_EVNT_XGMI_1_RESP_TX, RSMI_EVNT_XGMI_1_RESPONSE_TX}, - {RDC_EVNT_XGMI_1_BEATS_TX, RSMI_EVNT_XGMI_1_BEATS_TX}, + {RDC_EVNT_XGMI_1_BEATS_TX, RSMI_EVNT_XGMI_1_BEATS_TX}, {RDC_EVNT_XGMI_0_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_0}, {RDC_EVNT_XGMI_1_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_1}, @@ -58,447 +59,420 @@ static const std::unordered_map }; RdcMetricFetcherImpl::RdcMetricFetcherImpl() { - task_started_ = true; + task_started_ = true; - // kick off another thread for async fetch - updater_ = std::async(std::launch::async, [this]() { - while (task_started_) { - std::unique_lock lk(task_mutex_); - // Wait for tasks or stop signal - cv_.wait(lk, [this]{ - return !updated_tasks_.empty() || !task_started_; - }); - if 
(updated_tasks_.empty()) continue; + // kick off another thread for async fetch + updater_ = std::async(std::launch::async, [this]() { + while (task_started_) { + std::unique_lock lk(task_mutex_); + // Wait for tasks or stop signal + cv_.wait(lk, [this] { return !updated_tasks_.empty() || !task_started_; }); + if (updated_tasks_.empty()) continue; - // Get the tasks - auto item = updated_tasks_.front(); - updated_tasks_.pop(); - // The task may take long time, release lock - lk.unlock(); + // Get the tasks + auto item = updated_tasks_.front(); + updated_tasks_.pop(); + // The task may take long time, release lock + lk.unlock(); - // run task - item.task(*this, item.field); - } // end while (task_started_) - }); + // run task + item.task(*this, item.field); + } // end while (task_started_) + }); } RdcMetricFetcherImpl::~RdcMetricFetcherImpl() { - // Notify the async task to stop - task_started_ = false; - cv_.notify_all(); + // Notify the async task to stop + task_started_ = false; + cv_.notify_all(); } uint64_t RdcMetricFetcherImpl::now() { - struct timeval tv; - gettimeofday(&tv, NULL); - return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; } -void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index, - rdc_field_t field_id, rdc_field_value* value) { - rsmi_status_t err = RSMI_STATUS_SUCCESS; - uint64_t correctable_err = 0; - uint64_t uncorrectable_err = 0; - rsmi_ras_err_state_t err_state; +void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index, rdc_field_t field_id, + rdc_field_value* value) { + rsmi_status_t err = RSMI_STATUS_SUCCESS; + uint64_t correctable_err = 0; + uint64_t uncorrectable_err = 0; + rsmi_ras_err_state_t err_state; - if (!value) { - return; - } - for (uint32_t b = RSMI_GPU_BLOCK_FIRST; - b <= RSMI_GPU_BLOCK_LAST; b = b*2) { - err = rsmi_dev_ecc_status_get(gpu_index, static_cast(b), - &err_state); - if (err != 
RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_INFO, "Get the ecc Status error " << b - << ":" << err); - continue; - } - - rsmi_error_count_t ec; - err = rsmi_dev_ecc_count_get(gpu_index, - static_cast(b), &ec); - - if (err == RSMI_STATUS_SUCCESS) { - correctable_err += ec.correctable_err; - uncorrectable_err += ec.uncorrectable_err; - } + if (!value) { + return; + } + for (uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; b = b * 2) { + err = rsmi_dev_ecc_status_get(gpu_index, static_cast(b), &err_state); + if (err != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_INFO, "Get the ecc Status error " << b << ":" << err); + continue; } - value->status = RSMI_STATUS_SUCCESS; - value->type = INTEGER; - if (field_id == RDC_FI_ECC_CORRECT_TOTAL) { - value->value.l_int = correctable_err; - } - if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) { - value->value.l_int = uncorrectable_err; + rsmi_error_count_t ec; + err = rsmi_dev_ecc_count_get(gpu_index, static_cast(b), &ec); + + if (err == RSMI_STATUS_SUCCESS) { + correctable_err += ec.correctable_err; + uncorrectable_err += ec.uncorrectable_err; } + } + + value->status = RSMI_STATUS_SUCCESS; + value->type = INTEGER; + if (field_id == RDC_FI_ECC_CORRECT_TOTAL) { + value->value.l_int = correctable_err; + } + if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) { + value->value.l_int = uncorrectable_err; + } } -bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, - rdc_field_t field_id, rdc_field_value* value) { - if (!value) { +bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, rdc_field_t field_id, + rdc_field_value* value) { + if (!value) { + return false; + } + + do { + std::lock_guard guard(task_mutex_); + auto metric = async_metrics_.find({gpu_index, field_id}); + if (metric != async_metrics_.end()) { + if (now() < metric->second.last_time + metric->second.cache_ttl) { + RDC_LOG(RDC_DEBUG, + "Fetch " << gpu_index << ":" << field_id_string(field_id) << " from cache"); + value->status = 
metric->second.value.status; + value->type = metric->second.value.type; + value->value = metric->second.value.value; return false; + } } - do { - std::lock_guard guard(task_mutex_); - auto metric = async_metrics_.find({gpu_index, field_id}); - if ( metric != async_metrics_.end() ) { - if (now() < metric->second.last_time + metric->second.cache_ttl) { - RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << - field_id_string(field_id) << " from cache"); - value->status = metric->second.value.status; - value->type = metric->second.value.type; - value->value = metric->second.value.value; - return false; - } - } + // add to the async task queue + MetricTask t; + t.field = {gpu_index, field_id}; + t.task = &RdcMetricFetcherImpl::get_pcie_throughput; + updated_tasks_.push(t); - // add to the async task queue - MetricTask t; - t.field = {gpu_index, field_id}; - t.task = &RdcMetricFetcherImpl::get_pcie_throughput; - updated_tasks_.push(t); + RDC_LOG(RDC_DEBUG, + "Start async fetch " << gpu_index << ":" << field_id_string(field_id) << " to cache."); + } while (0); + cv_.notify_all(); - RDC_LOG(RDC_DEBUG, "Start async fetch " << gpu_index << ":" << - field_id_string(field_id) << " to cache."); - } while (0); - cv_.notify_all(); - - return true; + return true; } void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { - uint32_t gpu_index = key.first; - uint64_t sent, received, max_pkt_sz; - rsmi_status_t ret; + uint32_t gpu_index = key.first; + uint64_t sent, received, max_pkt_sz; + rsmi_status_t ret; - // Return if the cache does not expire yet - do { - std::lock_guard guard(task_mutex_); - auto metric = async_metrics_.find(key); - if (metric != async_metrics_.end() && - now() < metric->second.last_time + metric->second.cache_ttl) { - return; - } - } while (0); + // Return if the cache does not expire yet + do { + std::lock_guard guard(task_mutex_); + auto metric = async_metrics_.find(key); + if (metric != async_metrics_.end() && + now() < 
metric->second.last_time + metric->second.cache_ttl) { + return; + } + } while (0); - ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz); + ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz); - uint64_t curTime = now(); - MetricValue value; - value.cache_ttl = 30*1000; // cache 30 seconds - value.value.type = INTEGER; - do { - std::lock_guard guard(task_mutex_); - // Create new cache entry it does not exist - auto tx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_TX}); - if (tx_metric == async_metrics_.end()) { - tx_metric = async_metrics_.insert( - {{gpu_index, RDC_FI_PCIE_TX}, value}).first; - tx_metric->second.value.field_id = RDC_FI_PCIE_TX; - } - auto rx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_RX}); - if (rx_metric == async_metrics_.end()) { - rx_metric = async_metrics_.insert( - {{gpu_index, RDC_FI_PCIE_RX}, value}).first; - rx_metric->second.value.field_id = RDC_FI_PCIE_RX; - } + uint64_t curTime = now(); + MetricValue value; + value.cache_ttl = 30 * 1000; // cache 30 seconds + value.value.type = INTEGER; + do { + std::lock_guard guard(task_mutex_); + // Create new cache entry it does not exist + auto tx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_TX}); + if (tx_metric == async_metrics_.end()) { + tx_metric = async_metrics_.insert({{gpu_index, RDC_FI_PCIE_TX}, value}).first; + tx_metric->second.value.field_id = RDC_FI_PCIE_TX; + } + auto rx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_RX}); + if (rx_metric == async_metrics_.end()) { + rx_metric = async_metrics_.insert({{gpu_index, RDC_FI_PCIE_RX}, value}).first; + rx_metric->second.value.field_id = RDC_FI_PCIE_RX; + } - // Always update the status and last_time - tx_metric->second.last_time = curTime; - tx_metric->second.value.status = ret; - tx_metric->second.value.ts = curTime; + // Always update the status and last_time + tx_metric->second.last_time = curTime; + tx_metric->second.value.status = ret; + 
tx_metric->second.value.ts = curTime; - rx_metric->second.last_time = curTime; - rx_metric->second.value.status = ret; - rx_metric->second.value.ts = curTime; + rx_metric->second.last_time = curTime; + rx_metric->second.value.status = ret; + rx_metric->second.value.ts = curTime; - if (ret == RSMI_STATUS_NOT_SUPPORTED) { - RDC_LOG(RDC_ERROR, - "PCIe throughput not supported on GPU " << gpu_index); - return; - } + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + RDC_LOG(RDC_ERROR, "PCIe throughput not supported on GPU " << gpu_index); + return; + } - if (ret == RSMI_STATUS_SUCCESS) { - rx_metric->second.value.value.l_int = received; - tx_metric->second.value.value.l_int = sent; - RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":" << - "RDC_FI_PCIE_RX and RDC_FI_PCIE_TX to cache."); - } - } while (0); + if (ret == RSMI_STATUS_SUCCESS) { + rx_metric->second.value.value.l_int = received; + tx_metric->second.value.value.l_int = sent; + RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":" + << "RDC_FI_PCIE_RX and RDC_FI_PCIE_TX to cache."); + } + } while (0); } rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields( - rdc_gpu_field_t* fields, uint32_t fields_count, - std::vector& results) { // NOLINT - const std::set rdc_bulk_fields = { - RDC_FI_GPU_CLOCK, // current_gfxclk * 1000000 + rdc_gpu_field_t* fields, uint32_t fields_count, + std::vector& results) { // NOLINT + const std::set rdc_bulk_fields = { + RDC_FI_GPU_CLOCK, // current_gfxclk * 1000000 RDC_FI_MEMORY_TEMP, // temperature_mem * 1000 - RDC_FI_GPU_TEMP, // temperature_edge * 1000 + RDC_FI_GPU_TEMP, // temperature_edge * 1000 RDC_FI_POWER_USAGE, // average_socket_power * 1000000 - RDC_FI_GPU_UTIL // average_gfx_activity - }; + RDC_FI_GPU_UTIL // average_gfx_activity + }; - // To prevent always call the bulk API even if it is not supported, - // the static is used to cache last try. 
- static rsmi_status_t rs = RSMI_STATUS_SUCCESS; - if (rs != RSMI_STATUS_SUCCESS) { + // To prevent always call the bulk API even if it is not supported, + // the static is used to cache last try. + static rsmi_status_t rs = RSMI_STATUS_SUCCESS; + if (rs != RSMI_STATUS_SUCCESS) { + results.clear(); + return RDC_ST_NOT_SUPPORTED; + } + + // Organize the fields per GPU + std::map> bulk_fields; + for (uint32_t i = 0; i < fields_count; i++) { + if (rdc_bulk_fields.find(fields[i].field_id) != rdc_bulk_fields.end()) { + bulk_fields[fields[i].gpu_index].push_back(fields[i].field_id); + } + } + + // Call the rocm_smi_lib API to bulk fetch the data + auto cur_time = now(); + auto ite = bulk_fields.begin(); + for (; ite != bulk_fields.end(); ite++) { + rsmi_gpu_metrics_t gpu_metrics; + rs = rsmi_dev_gpu_metrics_info_get(ite->first, &gpu_metrics); + if (rs != RSMI_STATUS_SUCCESS) { results.clear(); return RDC_ST_NOT_SUPPORTED; } + for (uint32_t j = 0; j < ite->second.size(); j++) { + auto field_id = ite->second[j]; + rdc_gpu_field_value_t value; + value.gpu_index = ite->first; + value.field_value.field_id = field_id; + value.field_value.type = INTEGER; + value.field_value.status = RSMI_STATUS_SUCCESS; + value.field_value.ts = cur_time; - // Organize the fields per GPU - std::map> bulk_fields; - for (uint32_t i = 0; i < fields_count; i++) { - if (rdc_bulk_fields.find(fields[i].field_id) != rdc_bulk_fields.end()) { - bulk_fields[fields[i].gpu_index].push_back(fields[i].field_id); - } - } - - // Call the rocm_smi_lib API to bulk fetch the data - auto cur_time = now(); - auto ite = bulk_fields.begin(); - for (; ite != bulk_fields.end(); ite++) { - rsmi_gpu_metrics_t gpu_metrics; - rs = rsmi_dev_gpu_metrics_info_get(ite->first, &gpu_metrics); - if (rs != RSMI_STATUS_SUCCESS) { - results.clear(); - return RDC_ST_NOT_SUPPORTED; - } - for (uint32_t j=0; j < ite->second.size(); j++) { - auto field_id = ite->second[j]; - rdc_gpu_field_value_t value; - value.gpu_index = ite->first; - 
value.field_value.field_id = field_id; - value.field_value.type = INTEGER; - value.field_value.status = RSMI_STATUS_SUCCESS; - value.field_value.ts = cur_time; - - switch (field_id) { - case RDC_FI_GPU_CLOCK: // current_gfxclk * 1000000 - value.field_value.value.l_int = + switch (field_id) { + case RDC_FI_GPU_CLOCK: // current_gfxclk * 1000000 + value.field_value.value.l_int = static_cast(gpu_metrics.current_gfxclk * 1000000); - break; - case RDC_FI_MEMORY_TEMP: // temperature_mem * 1000 - value.field_value.value.l_int = - static_cast(gpu_metrics.temperature_mem * 1000); - break; - case RDC_FI_GPU_TEMP: // temperature_edge * 1000 - value.field_value.value.l_int = - static_cast(gpu_metrics.temperature_edge * 1000); - break; - case RDC_FI_POWER_USAGE: // average_socket_power * 1000000 - value.field_value.value.l_int = + break; + case RDC_FI_MEMORY_TEMP: // temperature_mem * 1000 + value.field_value.value.l_int = static_cast(gpu_metrics.temperature_mem * 1000); + break; + case RDC_FI_GPU_TEMP: // temperature_edge * 1000 + value.field_value.value.l_int = static_cast(gpu_metrics.temperature_edge * 1000); + break; + case RDC_FI_POWER_USAGE: // average_socket_power * 1000000 + value.field_value.value.l_int = static_cast(gpu_metrics.average_socket_power * 1000000); - // Ignore if the power is 0, which will fallback to non-bulk fetch. - if (value.field_value.value.l_int == 0) { - RDC_LOG(RDC_DEBUG, "Bulk fetch " << value.gpu_index << ":" << - "RDC_FI_POWER_USAGE fallback to regular way."); - continue; - } - break; - case RDC_FI_GPU_UTIL: // average_gfx_activity - value.field_value.value.l_int = - static_cast(gpu_metrics.average_gfx_activity); - break; - default: - value.field_value.status = RSMI_STATUS_NOT_SUPPORTED; - break; - } - if (value.field_value.status == RSMI_STATUS_SUCCESS) { - results.push_back(value); - } + // Ignore if the power is 0, which will fallback to non-bulk fetch. 
+ if (value.field_value.value.l_int == 0) { + RDC_LOG(RDC_DEBUG, "Bulk fetch " << value.gpu_index << ":" + << "RDC_FI_POWER_USAGE fallback to regular way."); + continue; + } + break; + case RDC_FI_GPU_UTIL: // average_gfx_activity + value.field_value.value.l_int = static_cast(gpu_metrics.average_gfx_activity); + break; + default: + value.field_value.status = RSMI_STATUS_NOT_SUPPORTED; + break; + } + if (value.field_value.status == RSMI_STATUS_SUCCESS) { + results.push_back(value); } } + } - return RDC_ST_OK; + return RDC_ST_OK; } static const uint64_t kGig = 1000000000; -rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, - rdc_field_t field_id, rdc_field_value* value) { - if (!value) { - return RDC_ST_BAD_PARAMETER; - } - uint64_t i64 = 0; - rsmi_temperature_type_t sensor_type; - rsmi_clk_type_t clk_type; - bool async_fetching = false; - RdcFieldKey f_key(gpu_index, field_id); - std::shared_ptr rsmi_data; - double coll_time_sec; +rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, + rdc_field_value* value) { + if (!value) { + return RDC_ST_BAD_PARAMETER; + } + uint64_t i64 = 0; + rsmi_temperature_type_t sensor_type; + rsmi_clk_type_t clk_type; + bool async_fetching = false; + RdcFieldKey f_key(gpu_index, field_id); + std::shared_ptr rsmi_data; + double coll_time_sec; - if (!is_field_valid(field_id)) { - RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id - << " which is not supported"); - return RDC_ST_NOT_SUPPORTED; + if (!is_field_valid(field_id)) { + RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id << " which is not supported"); + return RDC_ST_NOT_SUPPORTED; + } + + value->ts = now(); + value->field_id = field_id; + value->status = RSMI_STATUS_NOT_SUPPORTED; + + auto read_rsmi_counter = [&](void) { + rsmi_data = get_rsmi_data(f_key); + if (rsmi_data == nullptr) { + value->status = RSMI_STATUS_NOT_SUPPORTED; + return; } - value->ts = now(); - value->field_id = field_id; - value->status = 
RSMI_STATUS_NOT_SUPPORTED; + value->status = rsmi_counter_read(rsmi_data->evt_handle, &rsmi_data->counter_val); + value->value.l_int = rsmi_data->counter_val.value; + value->type = INTEGER; + }; - auto read_rsmi_counter = [&](void) { - rsmi_data = get_rsmi_data(f_key); - if (rsmi_data == nullptr) { - value->status = RSMI_STATUS_NOT_SUPPORTED; - return; - } - - value->status = rsmi_counter_read(rsmi_data->evt_handle, - &rsmi_data->counter_val); - value->value.l_int = rsmi_data->counter_val.value; + switch (field_id) { + case RDC_FI_GPU_MEMORY_USAGE: + value->status = rsmi_dev_memory_usage_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64); value->type = INTEGER; - }; + if (value->status == RSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(i64); + } + break; + case RDC_FI_GPU_MEMORY_TOTAL: + value->status = rsmi_dev_memory_total_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64); + value->type = INTEGER; + if (value->status == RSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(i64); + } + break; + case RDC_FI_GPU_COUNT: + uint32_t num_gpu; + value->status = rsmi_num_monitor_devices(&num_gpu); + value->type = INTEGER; + if (value->status == RSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(num_gpu); + } + break; + case RDC_FI_POWER_USAGE: + value->status = rsmi_dev_power_ave_get(gpu_index, RSMI_TEMP_CURRENT, &i64); + value->type = INTEGER; + if (value->status == RSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(i64); + } + break; + case RDC_FI_GPU_CLOCK: + case RDC_FI_MEM_CLOCK: + rsmi_frequencies_t f; + clk_type = RSMI_CLK_TYPE_SYS; + if (field_id == RDC_FI_MEM_CLOCK) { + clk_type = RSMI_CLK_TYPE_MEM; + } + value->status = rsmi_dev_gpu_clk_freq_get(gpu_index, clk_type, &f); + value->type = INTEGER; + if (value->status == RSMI_STATUS_SUCCESS) { + value->value.l_int = f.frequency[f.current]; + } + break; + case RDC_FI_GPU_UTIL: + uint32_t busy_percent; + value->status = rsmi_dev_busy_percent_get(gpu_index, &busy_percent); + value->type = INTEGER; + if 
(value->status == RSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(busy_percent); + } + break; + case RDC_FI_DEV_NAME: + value->status = rsmi_dev_name_get(gpu_index, value->value.str, RDC_MAX_STR_LENGTH); + value->type = STRING; + break; + case RDC_FI_GPU_TEMP: + case RDC_FI_MEMORY_TEMP: + int64_t val_i64; + sensor_type = RSMI_TEMP_TYPE_EDGE; + if (field_id == RDC_FI_MEMORY_TEMP) { + sensor_type = RSMI_TEMP_TYPE_MEMORY; + } + value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64); - switch (field_id) { - case RDC_FI_GPU_MEMORY_USAGE: - value->status = rsmi_dev_memory_usage_get(gpu_index, - RSMI_MEM_TYPE_VRAM, &i64); - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(i64); - } - break; - case RDC_FI_GPU_MEMORY_TOTAL: - value->status = rsmi_dev_memory_total_get(gpu_index, - RSMI_MEM_TYPE_VRAM, &i64); - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(i64); - } - break; - case RDC_FI_GPU_COUNT: - uint32_t num_gpu; - value->status = rsmi_num_monitor_devices(&num_gpu); - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(num_gpu); - } - break; - case RDC_FI_POWER_USAGE: - value->status = rsmi_dev_power_ave_get(gpu_index, - RSMI_TEMP_CURRENT, &i64); - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(i64); - } - break; - case RDC_FI_GPU_CLOCK: - case RDC_FI_MEM_CLOCK: - rsmi_frequencies_t f; - clk_type = RSMI_CLK_TYPE_SYS; - if (field_id == RDC_FI_MEM_CLOCK) { - clk_type = RSMI_CLK_TYPE_MEM; - } - value->status = rsmi_dev_gpu_clk_freq_get(gpu_index, - clk_type, &f); - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = f.frequency[f.current]; - } - break; - case RDC_FI_GPU_UTIL: - uint32_t busy_percent; - value->status = rsmi_dev_busy_percent_get(gpu_index, 
&busy_percent); - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(busy_percent); - } - break; - case RDC_FI_DEV_NAME: - value->status = rsmi_dev_name_get(gpu_index, - value->value.str, RDC_MAX_STR_LENGTH); - value->type = STRING; - break; - case RDC_FI_GPU_TEMP: - case RDC_FI_MEMORY_TEMP: - int64_t val_i64; - sensor_type = RSMI_TEMP_TYPE_EDGE; - if (field_id == RDC_FI_MEMORY_TEMP) { - sensor_type = RSMI_TEMP_TYPE_MEMORY; - } - value->status = rsmi_dev_temp_metric_get(gpu_index, - sensor_type , RSMI_TEMP_CURRENT, &val_i64); - - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = val_i64; - } - break; - case RDC_FI_ECC_CORRECT_TOTAL: - case RDC_FI_ECC_UNCORRECT_TOTAL: - get_ecc_error(gpu_index, field_id, value); - break; - case RDC_FI_PCIE_TX: - case RDC_FI_PCIE_RX: - async_fetching = async_get_pcie_throughput( - gpu_index, field_id, value); - break; - case RDC_EVNT_XGMI_0_NOP_TX: - case RDC_EVNT_XGMI_0_REQ_TX: - case RDC_EVNT_XGMI_0_RESP_TX: - case RDC_EVNT_XGMI_0_BEATS_TX: - case RDC_EVNT_XGMI_1_NOP_TX: - case RDC_EVNT_XGMI_1_REQ_TX: - case RDC_EVNT_XGMI_1_RESP_TX: - case RDC_EVNT_XGMI_1_BEATS_TX: - read_rsmi_counter(); - break; - case RDC_EVNT_XGMI_0_THRPUT: - case RDC_EVNT_XGMI_1_THRPUT: - case RDC_EVNT_XGMI_2_THRPUT: - case RDC_EVNT_XGMI_3_THRPUT: - case RDC_EVNT_XGMI_4_THRPUT: - case RDC_EVNT_XGMI_5_THRPUT: - read_rsmi_counter(); - if (value->status == RDC_ST_OK) { - if (rsmi_data->counter_val.time_running > 0) { - coll_time_sec = - static_cast(rsmi_data->counter_val.time_running)/kGig; - value->value.l_int = (value->value.l_int * 32)/coll_time_sec; - } else { - value->value.l_int = 0; - } - } - break; - - default: - break; - } - - int64_t latency = now()-value->ts; - if (value->status != RSMI_STATUS_SUCCESS) { - if (async_fetching) { //!< Async fetching is not an error - RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id)); + value->type = INTEGER; + if 
(value->status == RSMI_STATUS_SUCCESS) { + value->value.l_int = val_i64; + } + break; + case RDC_FI_ECC_CORRECT_TOTAL: + case RDC_FI_ECC_UNCORRECT_TOTAL: + get_ecc_error(gpu_index, field_id, value); + break; + case RDC_FI_PCIE_TX: + case RDC_FI_PCIE_RX: + async_fetching = async_get_pcie_throughput(gpu_index, field_id, value); + break; + case RDC_EVNT_XGMI_0_NOP_TX: + case RDC_EVNT_XGMI_0_REQ_TX: + case RDC_EVNT_XGMI_0_RESP_TX: + case RDC_EVNT_XGMI_0_BEATS_TX: + case RDC_EVNT_XGMI_1_NOP_TX: + case RDC_EVNT_XGMI_1_REQ_TX: + case RDC_EVNT_XGMI_1_RESP_TX: + case RDC_EVNT_XGMI_1_BEATS_TX: + read_rsmi_counter(); + break; + case RDC_EVNT_XGMI_0_THRPUT: + case RDC_EVNT_XGMI_1_THRPUT: + case RDC_EVNT_XGMI_2_THRPUT: + case RDC_EVNT_XGMI_3_THRPUT: + case RDC_EVNT_XGMI_4_THRPUT: + case RDC_EVNT_XGMI_5_THRPUT: + read_rsmi_counter(); + if (value->status == RDC_ST_OK) { + if (rsmi_data->counter_val.time_running > 0) { + coll_time_sec = static_cast(rsmi_data->counter_val.time_running) / kGig; + value->value.l_int = (value->value.l_int * 32) / coll_time_sec; } else { - RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" << - field_id_string(field_id) << " with rsmi error code " - << value->status <<", latency " << latency); + value->value.l_int = 0; } - } else if (value->type == INTEGER) { - RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << - field_id_string(field_id) << ":" << value->value.l_int - << ", latency " << latency); - } else if (value->type == DOUBLE) { - RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << - field_id_string(field_id) << ":" << value->value.dbl - << ", latency " << latency); - } else if (value->type == STRING) { - RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << - field_id_string(field_id) << ":" << value->value.str - << ", latency " << latency); - } + } + break; - return value->status == RSMI_STATUS_SUCCESS ? 
RDC_ST_OK : RDC_ST_MSI_ERROR; + default: + break; + } + + int64_t latency = now() - value->ts; + if (value->status != RSMI_STATUS_SUCCESS) { + if (async_fetching) { //!< Async fetching is not an error + RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id)); + } else { + RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" << field_id_string(field_id) + << " with rsmi error code " << value->status + << ", latency " << latency); + } + } else if (value->type == INTEGER) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << field_id_string(field_id) << ":" + << value->value.l_int << ", latency " << latency); + } else if (value->type == DOUBLE) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << field_id_string(field_id) << ":" + << value->value.dbl << ", latency " << latency); + } else if (value->type == STRING) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << field_id_string(field_id) << ":" + << value->value.str << ", latency " << latency); + } + + return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR; } -std::shared_ptr -RdcMetricFetcherImpl::get_rsmi_data(RdcFieldKey key) { - std::map>::iterator r_info = - rsmi_data_.find(key); +std::shared_ptr RdcMetricFetcherImpl::get_rsmi_data(RdcFieldKey key) { + std::map>::iterator r_info = rsmi_data_.find(key); if (r_info != rsmi_data_.end()) { return r_info->second; @@ -506,8 +480,8 @@ RdcMetricFetcherImpl::get_rsmi_data(RdcFieldKey key) { return nullptr; } -static rdc_status_t init_rsmi_counter(RdcFieldKey fk, - rsmi_event_group_t grp, rsmi_event_handle_t *handle) { +static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp, + rsmi_event_handle_t* handle) { rsmi_status_t ret; uint32_t counters_available; uint32_t dv_ind = fk.first; @@ -535,8 +509,7 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE); if (sc.error()) { - RDC_LOG(RDC_ERROR, - "Failed to acquire required capabilities. 
Errno " << sc.error()); + RDC_LOG(RDC_ERROR, "Failed to acquire required capabilities. Errno " << sc.error()); return RDC_ST_PERM_ERROR; } @@ -551,8 +524,7 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, sc.Relinquish(); if (sc.error()) { - RDC_LOG(RDC_ERROR, - "Failed to relinquish capabilities. Errno " << sc.error()); + RDC_LOG(RDC_ERROR, "Failed to relinquish capabilities. Errno " << sc.error()); return RDC_ST_PERM_ERROR; } @@ -589,8 +561,7 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) { if (ret != RSMI_STATUS_SUCCESS) { rsmi_data_.erase(fk); - RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << - Rsmi2RdcError(ret)); + RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << Rsmi2RdcError(ret)); return Rsmi2RdcError(ret); } @@ -662,11 +633,11 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { } if (ret == RDC_ST_INSUFF_RESOURCES) { - amd::rdc::fld_id2name_map_t &field_id_to_descript = - amd::rdc::get_field_id_description_from_id(); + amd::rdc::fld_id2name_map_t& field_id_to_descript = + amd::rdc::get_field_id_description_from_id(); - RDC_LOG(RDC_ERROR, "No event counters are available for " << - field_id_to_descript.at(fk.second).enum_name << " event."); + RDC_LOG(RDC_ERROR, "No event counters are available for " + << field_id_to_descript.at(fk.second).enum_name << " event."); } else if (ret != RDC_ST_OK) { RDC_LOG(RDC_ERROR, "Error in getting event counter handle: " << ret); } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc index 7b550d3d6d..9f07953ca7 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc @@ -20,22 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "rdc_lib/impl/RdcMetricsUpdaterImpl.h" + #include + +#include // NOLINT(build/c++11) #include #include -#include // NOLINT(build/c++11) + #include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdcMetricsUpdaterImpl::RdcMetricsUpdaterImpl( - const RdcWatchTablePtr& watch_table, - const uint32_t check_frequency): - watch_table_(watch_table) - , started_(false) - , _check_frequency(check_frequency) { -} +RdcMetricsUpdaterImpl::RdcMetricsUpdaterImpl(const RdcWatchTablePtr& watch_table, + const uint32_t check_frequency) + : watch_table_(watch_table), started_(false), _check_frequency(check_frequency) {} // Make the listen time for notifications a relatively long time. // There's no point in starting/stopping it constantly. @@ -43,29 +42,25 @@ static const uint32_t kRdcFieldListenNotifTime_mS = 10000; static const uint32_t kRdcEventCheck_ms = 1000; void RdcMetricsUpdaterImpl::start() { - if (started_) { - return; + if (started_) { + return; + } + started_ = true; + notif_updater_ = std::async(std::launch::async, [this]() { + while (started_) { + watch_table_->rdc_field_listen_notif(kRdcFieldListenNotifTime_mS); + std::this_thread::sleep_for(std::chrono::milliseconds(kRdcEventCheck_ms)); } - started_ = true; - notif_updater_ = std::async(std::launch::async, [this](){ - while (started_) { - watch_table_->rdc_field_listen_notif(kRdcFieldListenNotifTime_mS); - std::this_thread::sleep_for( - std::chrono::milliseconds(kRdcEventCheck_ms)); - } - }); - updater_ = std::async(std::launch::async, [this](){ - while (started_) { - watch_table_->rdc_field_update_all(); - std::this_thread::sleep_for( - std::chrono::microseconds(_check_frequency)); - } - }); + }); + updater_ = std::async(std::launch::async, [this]() { + while (started_) { + watch_table_->rdc_field_update_all(); + std::this_thread::sleep_for(std::chrono::microseconds(_check_frequency)); + } + }); } -void RdcMetricsUpdaterImpl::stop() { - started_ = false; -} +void RdcMetricsUpdaterImpl::stop() { 
started_ = false; } } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc index a900dc0504..4a694afcf5 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc @@ -29,33 +29,30 @@ THE SOFTWARE. namespace amd { namespace rdc { -RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher) - : fetcher_(fetcher) {} +RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher) : fetcher_(fetcher) {} RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() { - if (rdc_telemetry_module_) { - return rdc_telemetry_module_; - } - - if (!rdc_telemetry_module_) { - rdc_telemetry_module_.reset( - new RdcTelemetryModule(fetcher_)); - } - + if (rdc_telemetry_module_) { return rdc_telemetry_module_; + } + + if (!rdc_telemetry_module_) { + rdc_telemetry_module_.reset(new RdcTelemetryModule(fetcher_)); + } + + return rdc_telemetry_module_; } RdcDiagnosticPtr RdcModuleMgrImpl::get_diagnostic_module() { - if (rdc_diagnostic_module_) { - return rdc_diagnostic_module_; - } - - if (!rdc_diagnostic_module_) { - rdc_diagnostic_module_.reset( - new RdcDiagnosticModule(fetcher_)); - } - + if (rdc_diagnostic_module_) { return rdc_diagnostic_module_; + } + + if (!rdc_diagnostic_module_) { + rdc_diagnostic_module_.reset(new RdcDiagnosticModule(fetcher_)); + } + + return rdc_diagnostic_module_; } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc index 2686d0e07e..72e9d40e30 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc @@ -19,73 +19,66 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "rdc_lib/impl/RdcNotificationImpl.h" + #include #include -#include +#include +#include // NOLINT #include #include -#include // NOLINT +#include "common/rdc_capabilities.h" #include "rdc/rdc.h" -#include "rdc_lib/impl/RdcTelemetryModule.h" -#include "rdc_lib/impl/RdcNotificationImpl.h" -#include "rdc_lib/impl/RsmiUtils.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcSmiLib.h" +#include "rdc_lib/impl/RdcTelemetryModule.h" +#include "rdc_lib/impl/RsmiUtils.h" #include "rocm_smi/rocm_smi.h" -#include "common/rdc_capabilities.h" namespace amd { namespace rdc { -static std::unordered_map - rdc_2_rsmi_event_notif_map = { - {RDC_EVNT_NOTIF_VMFAULT, RSMI_EVT_NOTIF_VMFAULT}, - {RDC_EVNT_NOTIF_FIRST, RSMI_EVT_NOTIF_FIRST}, - {RDC_EVNT_NOTIF_THERMAL_THROTTLE, RSMI_EVT_NOTIF_THERMAL_THROTTLE}, - {RDC_EVNT_NOTIF_PRE_RESET, RSMI_EVT_NOTIF_GPU_PRE_RESET}, - {RDC_EVNT_NOTIF_POST_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET}, +static std::unordered_map rdc_2_rsmi_event_notif_map = { + {RDC_EVNT_NOTIF_VMFAULT, RSMI_EVT_NOTIF_VMFAULT}, + {RDC_EVNT_NOTIF_FIRST, RSMI_EVT_NOTIF_FIRST}, + {RDC_EVNT_NOTIF_THERMAL_THROTTLE, RSMI_EVT_NOTIF_THERMAL_THROTTLE}, + {RDC_EVNT_NOTIF_PRE_RESET, RSMI_EVT_NOTIF_GPU_PRE_RESET}, + {RDC_EVNT_NOTIF_POST_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET}, }; -static std::unordered_map - rsmi_event_notif_2_rdc_map = { - {RSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT}, - {RSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST}, - {RSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE}, - {RSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET}, - {RSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET}, +static std::unordered_map rsmi_event_notif_2_rdc_map = { + {RSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT}, + {RSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST}, + {RSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE}, + {RSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET}, + {RSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET}, }; 
// This const determines space allocated on stack for notification events. const uint32_t kMaxRSMIEvents = 64; -RdcNotificationImpl::RdcNotificationImpl() { -} +RdcNotificationImpl::RdcNotificationImpl() {} -RdcNotificationImpl::~RdcNotificationImpl() { -} +RdcNotificationImpl::~RdcNotificationImpl() {} -bool -RdcNotificationImpl::is_notification_event(rdc_field_t field) const { - if (rdc_2_rsmi_event_notif_map.find(field) == - rdc_2_rsmi_event_notif_map.end()) { +bool RdcNotificationImpl::is_notification_event(rdc_field_t field) const { + if (rdc_2_rsmi_event_notif_map.find(field) == rdc_2_rsmi_event_notif_map.end()) { return false; } return true; } -rdc_status_t -RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { +rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { rsmi_status_t ret; std::map new_masks; for (uint32_t i = 0; i < fk_arr.size(); ++i) { - if (rdc_2_rsmi_event_notif_map.find(fk_arr[i].second) == - rdc_2_rsmi_event_notif_map.end()) { + if (rdc_2_rsmi_event_notif_map.find(fk_arr[i].second) == rdc_2_rsmi_event_notif_map.end()) { continue; } new_masks[fk_arr[i].first] |= - RSMI_EVENT_MASK_FROM_INDEX(rdc_2_rsmi_event_notif_map[fk_arr[i].second]); + RSMI_EVENT_MASK_FROM_INDEX(rdc_2_rsmi_event_notif_map[fk_arr[i].second]); } std::map::iterator it = new_masks.begin(); @@ -101,17 +94,15 @@ RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE); if (sc.error()) { - RDC_LOG(RDC_ERROR, - "Failed to acquire required capabilities. Errno " << sc.error()); - return RDC_ST_PERM_ERROR; + RDC_LOG(RDC_ERROR, "Failed to acquire required capabilities. Errno " << sc.error()); + return RDC_ST_PERM_ERROR; } ret = rsmi_event_notification_init(it->first); if (ret != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, - "rsmi_event_notification_init() returned " << ret << " for device " << - it->first << ". 
" << std::endl << - " Will not listen for events on this device"); + RDC_LOG(RDC_ERROR, "rsmi_event_notification_init() returned " + << ret << " for device " << it->first << ". " << std::endl + << " Will not listen for events on this device"); continue; } @@ -120,18 +111,17 @@ RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { sc.Relinquish(); if (sc.error()) { - RDC_LOG(RDC_ERROR, - "Failed to relinquish capabilities. Errno " << sc.error()); + RDC_LOG(RDC_ERROR, "Failed to relinquish capabilities. Errno " << sc.error()); return RDC_ST_PERM_ERROR; } if (ret == RSMI_STATUS_SUCCESS) { gpu_evnt_notif_masks_[it->first] = it->second; - RDC_LOG(RDC_INFO, "Event notification mask for gpu " << it->first << - "is set to 0x" << std::hex << it->second); + RDC_LOG(RDC_INFO, "Event notification mask for gpu " << it->first << "is set to 0x" + << std::hex << it->second); } else { - RDC_LOG(RDC_INFO, "rsmi_event_notification_mask_set() returned " << ret - << " for device " << it->first); + RDC_LOG(RDC_INFO, + "rsmi_event_notification_mask_set() returned " << ret << " for device " << it->first); return Rsmi2RdcError(ret); } } @@ -139,9 +129,8 @@ RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { } // Blocking -rdc_status_t -RdcNotificationImpl::listen(rdc_evnt_notification_t *events, - uint32_t *num_events, uint32_t timeout_ms) { +rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32_t* num_events, + uint32_t timeout_ms) { if (events == nullptr || *num_events == 0) { return RDC_ST_BAD_PARAMETER; } @@ -149,40 +138,37 @@ RdcNotificationImpl::listen(rdc_evnt_notification_t *events, uint32_t f_cnt = std::min(*num_events, kMaxRSMIEvents); rsmi_evt_notification_data_t rsmi_events[kMaxRSMIEvents]; - rsmi_status_t ret = - rsmi_event_notification_get(timeout_ms, &f_cnt, rsmi_events); + rsmi_status_t ret = rsmi_event_notification_get(timeout_ms, &f_cnt, rsmi_events); if (ret != RSMI_STATUS_SUCCESS) { return Rsmi2RdcError(ret); } - 
struct timeval tv; + struct timeval tv; gettimeofday(&tv, NULL); - uint64_t now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; + uint64_t now = static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; *num_events = f_cnt; for (uint32_t i = 0; i < f_cnt; ++i) { assert(rsmi_event_notif_2_rdc_map.find(rsmi_events[i].event) != - rsmi_event_notif_2_rdc_map.end()); + rsmi_event_notif_2_rdc_map.end()); events[i].gpu_id = rsmi_events[i].dv_ind; events[i].field.field_id = rsmi_event_notif_2_rdc_map[rsmi_events[i].event]; events[i].field.status = RDC_ST_OK; events[i].field.ts = now; events[i].field.type = STRING; - strncpy_with_null(events[i].field.value.str, - rsmi_events[i].message, RDC_MAX_STR_LENGTH); + strncpy_with_null(events[i].field.value.str, rsmi_events[i].message, RDC_MAX_STR_LENGTH); } return RDC_ST_OK; } -rdc_status_t -RdcNotificationImpl::stop_listening(uint32_t gpu_id) { +rdc_status_t RdcNotificationImpl::stop_listening(uint32_t gpu_id) { rsmi_status_t ret; ret = rsmi_event_notification_mask_set(gpu_id, 0); if (ret != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "rsmi_event_notification_mask_set() returned " << ret - << " for device " << gpu_id); + RDC_LOG(RDC_ERROR, + "rsmi_event_notification_mask_set() returned " << ret << " for device " << gpu_id); } ret = rsmi_event_notification_stop(gpu_id); @@ -190,12 +176,11 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) { std::lock_guard guard(notif_mutex_); gpu_evnt_notif_masks_[gpu_id] = 0; } else { - RDC_LOG(RDC_ERROR, "rsmi_event_notification_stop() returned " << ret - << " for device " << gpu_id); + RDC_LOG(RDC_ERROR, + "rsmi_event_notification_stop() returned " << ret << " for device " << gpu_id); } return RDC_ST_OK; } - } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcPerfTimer.cc b/projects/rdc/rdc_libs/rdc/src/RdcPerfTimer.cc index a77e3f1fb3..47277bcc56 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcPerfTimer.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcPerfTimer.cc @@ -21,6 
+21,7 @@ THE SOFTWARE. */ #include "rdc_lib/RdcPerfTimer.h" + #include namespace amd { @@ -28,9 +29,7 @@ namespace rdc { static const uint64_t kNanosecondsPerSecond = 1000000000; -RdcPerfTimer::RdcPerfTimer(void) { - freq_in_100mhz = MeasureTSCFreqHz(); -} +RdcPerfTimer::RdcPerfTimer(void) { freq_in_100mhz = MeasureTSCFreqHz(); } RdcPerfTimer::~RdcPerfTimer() { while (!_timers.empty()) { @@ -62,8 +61,7 @@ int RdcPerfTimer::StartTimer(int index) { #ifndef _AMD struct timespec s; clock_gettime(CLOCK_MONOTONIC, &s); - _timers[index]->_start = (uint64_t) s.tv_sec * kNanosecondsPerSecond - + (uint64_t) s.tv_nsec; + _timers[index]->_start = (uint64_t)s.tv_sec * kNanosecondsPerSecond + (uint64_t)s.tv_nsec; #else // AMD timing method @@ -88,7 +86,7 @@ int RdcPerfTimer::StopTimer(int index) { #ifndef _AMD struct timespec s; clock_gettime(CLOCK_MONOTONIC, &s); - n = (uint64_t) s.tv_sec * kNanosecondsPerSecond + (uint64_t) s.tv_nsec; + n = (uint64_t)s.tv_sec * kNanosecondsPerSecond + (uint64_t)s.tv_nsec; #else // AMD Linux timing @@ -110,9 +108,7 @@ int RdcPerfTimer::StopTimer(int index) { return 0; } -void RdcPerfTimer::Error(std::string str) { - std::cout << str << std::endl; -} +void RdcPerfTimer::Error(std::string str) { std::cout << str << std::endl; } double RdcPerfTimer::ReadTimer(int index) { if (index >= static_cast(_timers.size())) { diff --git a/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc index f3e311e956..4e0b85a77e 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc @@ -19,176 +19,159 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include -#include "rdc_lib/rdc_common.h" -#include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcRasLib.h" +#include + +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/rdc_common.h" + namespace amd { namespace rdc { -RdcRasLib::RdcRasLib(): - fields_value_get_(nullptr) - , fields_query_(nullptr) - , fields_watch_(nullptr) - , fields_unwatch_(nullptr) - , rdc_module_init_(nullptr) - , rdc_module_destroy_(nullptr) { - rdc_status_t status = lib_loader_.load("librdc_ras.so"); - if (status != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "RAS related function will not work."); - return; - } +RdcRasLib::RdcRasLib() + : fields_value_get_(nullptr), + fields_query_(nullptr), + fields_watch_(nullptr), + fields_unwatch_(nullptr), + rdc_module_init_(nullptr), + rdc_module_destroy_(nullptr) { + rdc_status_t status = lib_loader_.load("librdc_ras.so"); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "RAS related function will not work."); + return; + } - status = lib_loader_.load_symbol(&rdc_module_init_, - "rdc_module_init"); - if (status != RDC_ST_OK) { - rdc_module_init_ = nullptr; - return; - } + status = lib_loader_.load_symbol(&rdc_module_init_, "rdc_module_init"); + if (status != RDC_ST_OK) { + rdc_module_init_ = nullptr; + return; + } - status = rdc_module_init_(0); - if (status != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Fail to init librdc_ras.so:" - << rdc_status_string(status) - << ". RAS related function will not work."); - return; - } + status = rdc_module_init_(0); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Fail to init librdc_ras.so:" << rdc_status_string(status) + << ". 
RAS related function will not work."); + return; + } + status = lib_loader_.load_symbol(&rdc_module_destroy_, "rdc_module_destroy"); + if (status != RDC_ST_OK) { + rdc_module_destroy_ = nullptr; + } - status = lib_loader_.load_symbol(&rdc_module_destroy_, - "rdc_module_destroy"); - if (status != RDC_ST_OK) { - rdc_module_destroy_ = nullptr; - } - - status = lib_loader_.load_symbol(&fields_value_get_, - "rdc_telemetry_fields_value_get"); - if (status != RDC_ST_OK) { - fields_value_get_ = nullptr; - } - status = lib_loader_.load_symbol(&fields_query_, - "rdc_telemetry_fields_query"); - if (status != RDC_ST_OK) { - fields_query_ = nullptr; - } - status = lib_loader_.load_symbol(&fields_watch_, - "rdc_telemetry_fields_watch"); - if (status != RDC_ST_OK) { - fields_watch_ = nullptr; - } - status = lib_loader_.load_symbol(&fields_unwatch_, - "rdc_telemetry_fields_unwatch"); - if (status != RDC_ST_OK) { - fields_unwatch_ = nullptr; - } + status = lib_loader_.load_symbol(&fields_value_get_, "rdc_telemetry_fields_value_get"); + if (status != RDC_ST_OK) { + fields_value_get_ = nullptr; + } + status = lib_loader_.load_symbol(&fields_query_, "rdc_telemetry_fields_query"); + if (status != RDC_ST_OK) { + fields_query_ = nullptr; + } + status = lib_loader_.load_symbol(&fields_watch_, "rdc_telemetry_fields_watch"); + if (status != RDC_ST_OK) { + fields_watch_ = nullptr; + } + status = lib_loader_.load_symbol(&fields_unwatch_, "rdc_telemetry_fields_unwatch"); + if (status != RDC_ST_OK) { + fields_unwatch_ = nullptr; + } } RdcRasLib::~RdcRasLib() { - if (rdc_module_destroy_) { - rdc_module_destroy_(); - } + if (rdc_module_destroy_) { + rdc_module_destroy_(); + } } -rdc_status_t RdcRasLib::rdc_telemetry_fields_query( - uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) { - if (field_count == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (!fields_query_) { - *field_count = 0; - return RDC_ST_FAIL_LOAD_MODULE; - } +rdc_status_t 
RdcRasLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count) { + if (field_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (!fields_query_) { + *field_count = 0; + return RDC_ST_FAIL_LOAD_MODULE; + } - auto status = fields_query_(field_ids, field_count); - RDC_LOG(RDC_DEBUG, "RAS support " << *field_count << " fields"); - return status; + auto status = fields_query_(field_ids, field_count); + RDC_LOG(RDC_DEBUG, "RAS support " << *field_count << " fields"); + return status; } -rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get( - rdc_gpu_field_t* fields, uint32_t fields_count, rdc_field_value_f callback, - void* user_data) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (!fields_value_get_) { - return RDC_ST_FAIL_LOAD_MODULE; - } - rdc_status_t status = fields_value_get_(fields, - fields_count, callback, user_data); - RDC_LOG(RDC_DEBUG, "Bulk fetched " << fields_count << " fields from RAS: " - << rdc_status_string(status)); - return status; +rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, + uint32_t fields_count, + rdc_field_value_f callback, + void* user_data) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (!fields_value_get_) { + return RDC_ST_FAIL_LOAD_MODULE; + } + rdc_status_t status = fields_value_get_(fields, fields_count, callback, user_data); + RDC_LOG(RDC_DEBUG, + "Bulk fetched " << fields_count << " fields from RAS: " << rdc_status_string(status)); + return status; } -rdc_status_t RdcRasLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, - uint32_t fields_count) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (!fields_watch_) { - return RDC_ST_FAIL_LOAD_MODULE; - } - rdc_status_t status = fields_watch_(fields, fields_count); - RDC_LOG(RDC_DEBUG, "Watch " << fields_count << " fields from RAS: " - << rdc_status_string(status)); - return status; +rdc_status_t 
RdcRasLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (!fields_watch_) { + return RDC_ST_FAIL_LOAD_MODULE; + } + rdc_status_t status = fields_watch_(fields, fields_count); + RDC_LOG(RDC_DEBUG, "Watch " << fields_count << " fields from RAS: " << rdc_status_string(status)); + return status; } rdc_status_t RdcRasLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, - uint32_t fields_count) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (!fields_unwatch_) { - return RDC_ST_FAIL_LOAD_MODULE; - } - rdc_status_t status = fields_unwatch_(fields, fields_count); - RDC_LOG(RDC_DEBUG, "Unwatch " << fields_count << " fields from RAS: " - << rdc_status_string(status)); - return status; + uint32_t fields_count) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (!fields_unwatch_) { + return RDC_ST_FAIL_LOAD_MODULE; + } + rdc_status_t status = fields_unwatch_(fields, fields_count); + RDC_LOG(RDC_DEBUG, + "Unwatch " << fields_count << " fields from RAS: " << rdc_status_string(status)); + return status; } - -rdc_status_t RdcRasLib::rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) { - (void)test_cases; - (void)test_case_count; - return RDC_ST_NOT_SUPPORTED; +rdc_status_t RdcRasLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + (void)test_cases; + (void)test_case_count; + return RDC_ST_NOT_SUPPORTED; } - // Run a specific test case -rdc_status_t RdcRasLib::rdc_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { - (void)test_case; - (void)gpu_index; - (void)result; - (void)gpu_count; - return RDC_ST_NOT_SUPPORTED; +// Run a specific test case +rdc_status_t RdcRasLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, + 
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, rdc_diag_test_result_t* result) { + (void)test_case; + (void)gpu_index; + (void)result; + (void)gpu_count; + return RDC_ST_NOT_SUPPORTED; } -rdc_status_t RdcRasLib::rdc_diagnostic_run( - const rdc_group_info_t& gpus, - rdc_diag_level_t level, - rdc_diag_response_t* response) { - (void)gpus; - (void)level; - (void)response; - return RDC_ST_NOT_SUPPORTED; +rdc_status_t RdcRasLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + rdc_diag_response_t* response) { + (void)gpus; + (void)level; + (void)response; + return RDC_ST_NOT_SUPPORTED; } rdc_status_t RdcRasLib::rdc_diag_init(uint64_t flags) { - (void)flags; - return RDC_ST_NOT_SUPPORTED; -} -rdc_status_t RdcRasLib::rdc_diag_destroy() { - return RDC_ST_NOT_SUPPORTED; + (void)flags; + return RDC_ST_NOT_SUPPORTED; } +rdc_status_t RdcRasLib::rdc_diag_destroy() { return RDC_ST_NOT_SUPPORTED; } } // namespace rdc } // namespace amd - diff --git a/projects/rdc/rdc_libs/rdc/src/RdcRocpLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcRocpLib.cc index f6617ffef6..e082c82757 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcRocpLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcRocpLib.cc @@ -40,168 +40,156 @@ RdcRocpLib::RdcRocpLib(const char* lib_name) telemetry_fields_value_get_(nullptr), telemetry_fields_watch_(nullptr), telemetry_fields_unwatch_(nullptr) { - rdc_status_t status = lib_loader_.load(lib_name); - if (status != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Rocp related function will not work."); - return; - } + rdc_status_t status = lib_loader_.load(lib_name); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Rocp related function will not work."); + return; + } - status = set_rocmtools_path(); - if (status != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Rocp related function will not work."); - return; - } + status = set_rocmtools_path(); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Rocp related function will not work."); + return; + } - status = 
lib_loader_.load_symbol( - &telemetry_fields_query_, "rdc_telemetry_fields_query"); - if (status != RDC_ST_OK) { - telemetry_fields_query_ = nullptr; - } + status = lib_loader_.load_symbol(&telemetry_fields_query_, "rdc_telemetry_fields_query"); + if (status != RDC_ST_OK) { + telemetry_fields_query_ = nullptr; + } - status = lib_loader_.load_symbol( - &telemetry_fields_value_get_, "rdc_telemetry_fields_value_get"); - if (status != RDC_ST_OK) { - telemetry_fields_value_get_ = nullptr; - } + status = lib_loader_.load_symbol(&telemetry_fields_value_get_, "rdc_telemetry_fields_value_get"); + if (status != RDC_ST_OK) { + telemetry_fields_value_get_ = nullptr; + } - status = lib_loader_.load_symbol( - &telemetry_fields_watch_, "rdc_telemetry_fields_watch"); - if (status != RDC_ST_OK) { - telemetry_fields_watch_ = nullptr; - } + status = lib_loader_.load_symbol(&telemetry_fields_watch_, "rdc_telemetry_fields_watch"); + if (status != RDC_ST_OK) { + telemetry_fields_watch_ = nullptr; + } - status = lib_loader_.load_symbol( - &telemetry_fields_unwatch_, "rdc_telemetry_fields_unwatch"); - if (status != RDC_ST_OK) { - telemetry_fields_unwatch_ = nullptr; - } + status = lib_loader_.load_symbol(&telemetry_fields_unwatch_, "rdc_telemetry_fields_unwatch"); + if (status != RDC_ST_OK) { + telemetry_fields_unwatch_ = nullptr; + } } RdcRocpLib::~RdcRocpLib() = default; // get support field ids -rdc_status_t RdcRocpLib::rdc_telemetry_fields_query( - uint32_t field_ids[MAX_NUM_FIELDS], - uint32_t* field_count) { - if (field_count == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (telemetry_fields_query_ == nullptr) { - return RDC_ST_FAIL_LOAD_MODULE; - } +rdc_status_t RdcRocpLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count) { + if (field_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (telemetry_fields_query_ == nullptr) { + return RDC_ST_FAIL_LOAD_MODULE; + } - return telemetry_fields_query_(field_ids, field_count); + return 
telemetry_fields_query_(field_ids, field_count); } // Fetch -rdc_status_t RdcRocpLib::rdc_telemetry_fields_value_get( - rdc_gpu_field_t* fields, - uint32_t fields_count, - rdc_field_value_f callback, - void* user_data) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (telemetry_fields_value_get_ == nullptr) { - return RDC_ST_FAIL_LOAD_MODULE; - } +rdc_status_t RdcRocpLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, + uint32_t fields_count, + rdc_field_value_f callback, + void* user_data) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (telemetry_fields_value_get_ == nullptr) { + return RDC_ST_FAIL_LOAD_MODULE; + } - return telemetry_fields_value_get_( - fields, fields_count, callback, user_data); + return telemetry_fields_value_get_(fields, fields_count, callback, user_data); } -rdc_status_t RdcRocpLib::rdc_telemetry_fields_watch( - rdc_gpu_field_t* fields, - uint32_t fields_count) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (telemetry_fields_watch_ == nullptr) { - return RDC_ST_FAIL_LOAD_MODULE; - } +rdc_status_t RdcRocpLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, + uint32_t fields_count) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (telemetry_fields_watch_ == nullptr) { + return RDC_ST_FAIL_LOAD_MODULE; + } - return telemetry_fields_watch_(fields, fields_count); + return telemetry_fields_watch_(fields, fields_count); } -rdc_status_t RdcRocpLib::rdc_telemetry_fields_unwatch( - rdc_gpu_field_t* fields, - uint32_t fields_count) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (telemetry_fields_unwatch_ == nullptr) { - return RDC_ST_FAIL_LOAD_MODULE; - } +rdc_status_t RdcRocpLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, + uint32_t fields_count) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (telemetry_fields_unwatch_ == nullptr) { + return RDC_ST_FAIL_LOAD_MODULE; + } - return 
telemetry_fields_unwatch_(fields, fields_count); + return telemetry_fields_unwatch_(fields, fields_count); } std::string RdcRocpLib::get_rocm_path() { - // set default rocm path in case lookup fails - std::string rocm_path("/opt/rocm"); - const char* rocm_path_env = getenv("ROCM_PATH"); - if (rocm_path_env != nullptr) { - rocm_path = rocm_path_env; - } + // set default rocm path in case lookup fails + std::string rocm_path("/opt/rocm"); + const char* rocm_path_env = getenv("ROCM_PATH"); + if (rocm_path_env != nullptr) { + rocm_path = rocm_path_env; + } - std::ifstream file("/proc/self/maps"); - - if (!file.is_open()) { - return rocm_path; - } - - std::string line; - while (getline(file, line)) { - size_t index_end = line.find("librocmtools.so"); - size_t index_start = index_end; - if (index_end == std::string::npos) { - // no library on this line - continue; - } - // walk index backwards until it reaches a space - while ((index_start > 0) && (line[index_start - 1] != ' ')) { - index_start--; - } - // extract library path, drop library name - rocm_path = line.substr(index_start, index_end - index_start); - // appending "../" should result in "/opt/rocm/lib/.." or similar - rocm_path += ".."; - return rocm_path; - } + std::ifstream file("/proc/self/maps"); + if (!file.is_open()) { return rocm_path; + } + + std::string line; + while (getline(file, line)) { + size_t index_end = line.find("librocmtools.so"); + size_t index_start = index_end; + if (index_end == std::string::npos) { + // no library on this line + continue; + } + // walk index backwards until it reaches a space + while ((index_start > 0) && (line[index_start - 1] != ' ')) { + index_start--; + } + // extract library path, drop library name + rocm_path = line.substr(index_start, index_end - index_start); + // appending "../" should result in "/opt/rocm/lib/.." 
or similar + rocm_path += ".."; + return rocm_path; + } + + return rocm_path; } rdc_status_t RdcRocpLib::set_rocmtools_path() { - // librocmtools requires ROCMTOOLS_METRICS_PATH to be set - std::string rocmtools_metrics_path = - get_rocm_path() + "/libexec/rocmtools/counters/derived_counters.xml"; + // librocmtools requires ROCMTOOLS_METRICS_PATH to be set + std::string rocmtools_metrics_path = + get_rocm_path() + "/libexec/rocmtools/counters/derived_counters.xml"; - // set rocm prefix - int result = - setenv("ROCMTOOLS_METRICS_PATH", rocmtools_metrics_path.c_str(), 0); - if (result != 0) { - RDC_LOG(RDC_ERROR, "setenv ROCMTOOLS_METRICS_PATH failed! " << result); - return RDC_ST_PERM_ERROR; - } + // set rocm prefix + int result = setenv("ROCMTOOLS_METRICS_PATH", rocmtools_metrics_path.c_str(), 0); + if (result != 0) { + RDC_LOG(RDC_ERROR, "setenv ROCMTOOLS_METRICS_PATH failed! " << result); + return RDC_ST_PERM_ERROR; + } - // check that env exists - const char* rocmtools_metrics_env = getenv("ROCMTOOLS_METRICS_PATH"); - if (rocmtools_metrics_env == nullptr) { - RDC_LOG(RDC_ERROR, "ROCMTOOLS_METRICS_PATH is not set!"); - return RDC_ST_NO_DATA; - } + // check that env exists + const char* rocmtools_metrics_env = getenv("ROCMTOOLS_METRICS_PATH"); + if (rocmtools_metrics_env == nullptr) { + RDC_LOG(RDC_ERROR, "ROCMTOOLS_METRICS_PATH is not set!"); + return RDC_ST_NO_DATA; + } - // check that file can be accessed - std::ifstream test_file(rocmtools_metrics_env); - if (!test_file.good()) { - RDC_LOG( - RDC_ERROR, - "failed to open ROCMTOOLS_METRICS_PATH: " << rocmtools_metrics_env); - return RDC_ST_FILE_ERROR; - } + // check that file can be accessed + std::ifstream test_file(rocmtools_metrics_env); + if (!test_file.good()) { + RDC_LOG(RDC_ERROR, "failed to open ROCMTOOLS_METRICS_PATH: " << rocmtools_metrics_env); + return RDC_ST_FILE_ERROR; + } - return RDC_ST_OK; + return RDC_ST_OK; } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc/src/RdcRocrLib.cc 
b/projects/rdc/rdc_libs/rdc/src/RdcRocrLib.cc index 7a78b59571..0744b97a30 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcRocrLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcRocrLib.cc @@ -19,125 +19,114 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include -#include "rdc_lib/rdc_common.h" -#include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcRocrLib.h" +#include + +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/rdc_common.h" + namespace amd { namespace rdc { -RdcRocrLib::RdcRocrLib(): - test_case_run_(nullptr) - , diag_test_cases_query_(nullptr) - , diag_init_(nullptr) - , diag_destroy_(nullptr) { - rdc_status_t status = lib_loader_.load("librdc_rocr.so"); - if (status != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Rocr related function will not work."); - return; - } +RdcRocrLib::RdcRocrLib() + : test_case_run_(nullptr), + diag_test_cases_query_(nullptr), + diag_init_(nullptr), + diag_destroy_(nullptr) { + rdc_status_t status = lib_loader_.load("librdc_rocr.so"); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Rocr related function will not work."); + return; + } - status = lib_loader_.load_symbol(&diag_init_, - "rdc_diag_init"); - if (status != RDC_ST_OK) { - diag_init_ = nullptr; - return; - } + status = lib_loader_.load_symbol(&diag_init_, "rdc_diag_init"); + if (status != RDC_ST_OK) { + diag_init_ = nullptr; + return; + } - status = diag_init_(0); - if (status != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Fail to init librdc_rocr.so:" - << rdc_status_string(status) - << ". Rocr related function will not work."); - return; - } + status = diag_init_(0); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Fail to init librdc_rocr.so:" << rdc_status_string(status) + << ". 
Rocr related function will not work."); + return; + } - status = lib_loader_.load_symbol(&diag_destroy_, - "rdc_diag_destroy"); - if (status != RDC_ST_OK) { - diag_destroy_ = nullptr; - } + status = lib_loader_.load_symbol(&diag_destroy_, "rdc_diag_destroy"); + if (status != RDC_ST_OK) { + diag_destroy_ = nullptr; + } - status = lib_loader_.load_symbol(&test_case_run_, - "rdc_diag_test_case_run"); - if (status != RDC_ST_OK) { - test_case_run_ = nullptr; - } - status = lib_loader_.load_symbol(&diag_test_cases_query_, - "rdc_diag_test_cases_query"); - if (status != RDC_ST_OK) { - diag_test_cases_query_ = nullptr; - } + status = lib_loader_.load_symbol(&test_case_run_, "rdc_diag_test_case_run"); + if (status != RDC_ST_OK) { + test_case_run_ = nullptr; + } + status = lib_loader_.load_symbol(&diag_test_cases_query_, "rdc_diag_test_cases_query"); + if (status != RDC_ST_OK) { + diag_test_cases_query_ = nullptr; + } } RdcRocrLib::~RdcRocrLib() { - if (diag_destroy_) { - diag_destroy_(); - } + if (diag_destroy_) { + diag_destroy_(); + } } -rdc_status_t RdcRocrLib::rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) { - if (test_case_count == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (!diag_test_cases_query_) { - return RDC_ST_FAIL_LOAD_MODULE; - } +rdc_status_t RdcRocrLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + if (test_case_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (!diag_test_cases_query_) { + return RDC_ST_FAIL_LOAD_MODULE; + } - rdc_status_t status = diag_test_cases_query_(test_cases, test_case_count); - RDC_LOG(RDC_DEBUG, "Query " << *test_case_count << " test cases from Rocr: " - << rdc_status_string(status)); - return status; + rdc_status_t status = diag_test_cases_query_(test_cases, test_case_count); + RDC_LOG(RDC_DEBUG, + "Query " << *test_case_count << " test cases from Rocr: " << rdc_status_string(status)); + 
return status; } - // Run a specific test case -rdc_status_t RdcRocrLib::rdc_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { - if (result == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - if (!test_case_run_) { - return RDC_ST_FAIL_LOAD_MODULE; - } +// Run a specific test case +rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (!test_case_run_) { + return RDC_ST_FAIL_LOAD_MODULE; + } - rdc_status_t status = test_case_run_(test_case, gpu_index, - gpu_count, result); - RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from Rocr: " - << rdc_status_string(status)); - return status; + rdc_status_t status = test_case_run_(test_case, gpu_index, gpu_count, result); + RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from Rocr: " << rdc_status_string(status)); + return status; } -rdc_status_t RdcRocrLib::rdc_diagnostic_run( - const rdc_group_info_t& gpus, - rdc_diag_level_t level, - rdc_diag_response_t* response) { - (void)gpus; - (void)level; - (void)response; - return RDC_ST_NOT_SUPPORTED; +rdc_status_t RdcRocrLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + rdc_diag_response_t* response) { + (void)gpus; + (void)level; + (void)response; + return RDC_ST_NOT_SUPPORTED; } rdc_status_t RdcRocrLib::rdc_diag_init(uint64_t flags) { - if (!diag_init_) { - return RDC_ST_FAIL_LOAD_MODULE; - } + if (!diag_init_) { + return RDC_ST_FAIL_LOAD_MODULE; + } - return diag_init_(flags); + return diag_init_(flags); } rdc_status_t RdcRocrLib::rdc_diag_destroy() { - if (!diag_destroy_) { - return RDC_ST_FAIL_LOAD_MODULE; - } + if (!diag_destroy_) { + return RDC_ST_FAIL_LOAD_MODULE; + } - return diag_destroy_(); + return diag_destroy_(); } } // namespace rdc } // 
namespace amd - diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc index 204276bd38..69472db791 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc @@ -20,560 +20,516 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "rdc_lib/impl/RdcSmiDiagnosticImpl.h" -#include + #include #include -#include "rdc_lib/rdc_common.h" +#include + #include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RsmiUtils.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdcSmiDiagnosticImpl::RdcSmiDiagnosticImpl() { -} +RdcSmiDiagnosticImpl::RdcSmiDiagnosticImpl() {} -rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info( - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { - if (result == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - *result = {}; - result->test_case = RDC_DIAG_COMPUTE_PROCESS; - result->status = RDC_DIAG_RESULT_SKIP; +rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + *result = {}; + result->test_case = RDC_DIAG_COMPUTE_PROCESS; + result->status = RDC_DIAG_RESULT_SKIP; + result->per_gpu_result_count = 0; + rsmi_status_t err = RSMI_STATUS_SUCCESS; + uint32_t num_items = 0; + err = rsmi_compute_process_info_get(nullptr, &num_items); + if (err != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Fail to get process information: " << err); + strncpy_with_null(result->info, "Fail to retreive process information from rocm_smi_lib", + MAX_DIAG_MSG_LENGTH); + return Rsmi2RdcError(err); + } + + // No process found + if (num_items == 0) { + result->status = RDC_DIAG_RESULT_PASS; result->per_gpu_result_count = 0; - rsmi_status_t err = RSMI_STATUS_SUCCESS; 
- uint32_t num_items = 0; - err = rsmi_compute_process_info_get( - nullptr, &num_items); - if (err != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, - "Fail to get process information: " << err); - strncpy_with_null(result->info, - "Fail to retreive process information from rocm_smi_lib", - MAX_DIAG_MSG_LENGTH); - return Rsmi2RdcError(err); - } - - // No process found - if (num_items == 0) { - result->status = RDC_DIAG_RESULT_PASS; - result->per_gpu_result_count = 0; - strncpy_with_null(result->info, - "No processes running on any devices.", - MAX_DIAG_MSG_LENGTH); - return RDC_ST_OK; - } - - std::string info; - // Find details of the process running on each GPU - std::vector procs(num_items); - err = rsmi_compute_process_info_get( - reinterpret_cast(&procs[0]), &num_items); - if (err != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_INFO, - "Fail to get process detail information: " << err); - strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH); - return Rsmi2RdcError(err); - } - - std::map> pids_per_gpu; - for (uint32_t i=0; i < num_items; i++) { - // Skip the process does not occupy any GPUs. The hsa_shutdown() - // will not clear /proc sys file until the process is terminated. 
- if (procs[i].cu_occupancy == 0 ) continue; - info += " Process: " + std::to_string(procs[i].process_id) - += ", pasid: " + std::to_string(procs[i].pasid) - += ", vram_usage: " + std::to_string(procs[i].vram_usage) - += ", sdma_usage: " + std::to_string(procs[i].sdma_usage) - += ", cu_occupancy: " + std::to_string(procs[i].cu_occupancy) - +="."; - - // Get the num_devices the process is running - uint32_t num_devices = 0; - err = rsmi_compute_process_gpus_get(procs[i].process_id, - nullptr, &num_devices); - if (err != RSMI_STATUS_SUCCESS || num_devices == 0) { - RDC_LOG(RDC_INFO, - "Fail to get process GPUs detail information: " << err); - continue; - } - - // Get the details of devices - std::vector device_details(num_devices); - err = rsmi_compute_process_gpus_get(procs[i].process_id, - reinterpret_cast(&device_details[0]), &num_devices); - if (err != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_INFO, - "Fail to get process GPUs detail information: " << err); - continue; - } - - // Add process information in per GPU structure - for (uint32_t j=0; j < num_devices; j++) { - // return the value if exists - auto ite = pids_per_gpu.insert(std::pair>(device_details[j], - std::vector())); - ite.first->second.push_back(procs[i].process_id); - } - } // end for (uint32_t i=0 ...) 
- - result->status = RDC_DIAG_RESULT_PASS; // pass by default - if (pids_per_gpu.size() == 0) { - result->per_gpu_result_count = 0; - strncpy_with_null(result->info, - "No processes running on any devices.", - MAX_DIAG_MSG_LENGTH); - return RDC_ST_OK; - } - - // Mark as fail - for (uint32_t i=0; i < gpu_count; i++) { - if (pids_per_gpu.find(gpu_index[i]) != pids_per_gpu.end()) { - result->status = RDC_DIAG_RESULT_FAIL; - break; - } - } - - // Set per GPU information - strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH); - for (auto ite=pids_per_gpu.begin(); ite != pids_per_gpu.end(); ++ite) { - auto& per_gpu = result->gpu_results[result->per_gpu_result_count]; - per_gpu.gpu_index = ite->first; - per_gpu.gpu_result.code = 0; - std::string per_gpu_msg = "Running process:"; - for (uint32_t k=0; k < ite->second.size(); k++) { - per_gpu_msg += " " + std::to_string(ite->second[k]); - } - - strncpy_with_null(per_gpu.gpu_result.msg, - per_gpu_msg.c_str(), MAX_DIAG_MSG_LENGTH); - - result->per_gpu_result_count++; - if (result->per_gpu_result_count >= RDC_MAX_NUM_DEVICES) { - RDC_LOG(RDC_ERROR, "Found more GPUs than " << RDC_MAX_NUM_DEVICES); - break; - } - } - + strncpy_with_null(result->info, "No processes running on any devices.", MAX_DIAG_MSG_LENGTH); return RDC_ST_OK; + } + + std::string info; + // Find details of the process running on each GPU + std::vector procs(num_items); + err = + rsmi_compute_process_info_get(reinterpret_cast(&procs[0]), &num_items); + if (err != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_INFO, "Fail to get process detail information: " << err); + strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH); + return Rsmi2RdcError(err); + } + + std::map> pids_per_gpu; + for (uint32_t i = 0; i < num_items; i++) { + // Skip the process does not occupy any GPUs. The hsa_shutdown() + // will not clear /proc sys file until the process is terminated. 
+ if (procs[i].cu_occupancy == 0) continue; + info += " Process: " + std::to_string(procs[i].process_id) += + ", pasid: " + std::to_string(procs[i].pasid) += + ", vram_usage: " + std::to_string(procs[i].vram_usage) += + ", sdma_usage: " + std::to_string(procs[i].sdma_usage) += + ", cu_occupancy: " + std::to_string(procs[i].cu_occupancy) += "."; + + // Get the num_devices the process is running + uint32_t num_devices = 0; + err = rsmi_compute_process_gpus_get(procs[i].process_id, nullptr, &num_devices); + if (err != RSMI_STATUS_SUCCESS || num_devices == 0) { + RDC_LOG(RDC_INFO, "Fail to get process GPUs detail information: " << err); + continue; + } + + // Get the details of devices + std::vector device_details(num_devices); + err = rsmi_compute_process_gpus_get( + procs[i].process_id, reinterpret_cast(&device_details[0]), &num_devices); + if (err != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_INFO, "Fail to get process GPUs detail information: " << err); + continue; + } + + // Add process information in per GPU structure + for (uint32_t j = 0; j < num_devices; j++) { + // return the value if exists + auto ite = pids_per_gpu.insert( + std::pair>(device_details[j], std::vector())); + ite.first->second.push_back(procs[i].process_id); + } + } // end for (uint32_t i=0 ...) 
+ + result->status = RDC_DIAG_RESULT_PASS; // pass by default + if (pids_per_gpu.size() == 0) { + result->per_gpu_result_count = 0; + strncpy_with_null(result->info, "No processes running on any devices.", MAX_DIAG_MSG_LENGTH); + return RDC_ST_OK; + } + + // Mark as fail + for (uint32_t i = 0; i < gpu_count; i++) { + if (pids_per_gpu.find(gpu_index[i]) != pids_per_gpu.end()) { + result->status = RDC_DIAG_RESULT_FAIL; + break; + } + } + + // Set per GPU information + strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH); + for (auto ite = pids_per_gpu.begin(); ite != pids_per_gpu.end(); ++ite) { + auto& per_gpu = result->gpu_results[result->per_gpu_result_count]; + per_gpu.gpu_index = ite->first; + per_gpu.gpu_result.code = 0; + std::string per_gpu_msg = "Running process:"; + for (uint32_t k = 0; k < ite->second.size(); k++) { + per_gpu_msg += " " + std::to_string(ite->second[k]); + } + + strncpy_with_null(per_gpu.gpu_result.msg, per_gpu_msg.c_str(), MAX_DIAG_MSG_LENGTH); + + result->per_gpu_result_count++; + if (result->per_gpu_result_count >= RDC_MAX_NUM_DEVICES) { + RDC_LOG(RDC_ERROR, "Found more GPUs than " << RDC_MAX_NUM_DEVICES); + break; + } + } + + return RDC_ST_OK; } -std::string RdcSmiDiagnosticImpl::get_temperature_string( - rsmi_temperature_type_t type) const { - switch (type) { - case RSMI_TEMP_TYPE_EDGE: - return "Edge"; - case RSMI_TEMP_TYPE_JUNCTION: - return "Junction"; - case RSMI_TEMP_TYPE_MEMORY: - return "Memory"; - default: - return "Unknown"; - } +std::string RdcSmiDiagnosticImpl::get_temperature_string(rsmi_temperature_type_t type) const { + switch (type) { + case RSMI_TEMP_TYPE_EDGE: + return "Edge"; + case RSMI_TEMP_TYPE_JUNCTION: + return "Junction"; + case RSMI_TEMP_TYPE_MEMORY: + return "Memory"; + default: + return "Unknown"; + } } -std::string RdcSmiDiagnosticImpl::get_voltage_string( - rsmi_voltage_type_t type) const { - switch (type) { - case RSMI_VOLT_TYPE_VDDGFX: - return "Vddgfx voltage"; - default: - return 
"Unknown"; - } +std::string RdcSmiDiagnosticImpl::get_voltage_string(rsmi_voltage_type_t type) const { + switch (type) { + case RSMI_VOLT_TYPE_VDDGFX: + return "Vddgfx voltage"; + default: + return "Unknown"; + } } // Show topology type -rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info( - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { - if (result == nullptr) { - return RDC_ST_BAD_PARAMETER; +rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + *result = {}; + result->test_case = RDC_DIAG_NODE_TOPOLOGY; + + const std::map link_to_string = { + {RSMI_IOLINK_TYPE_UNDEFINED, "Undefined"}, + {RSMI_IOLINK_TYPE_PCIEXPRESS, "PCI Express"}, + {RSMI_IOLINK_TYPE_XGMI, "XGMI"}, + {RSMI_IOLINK_TYPE_NUMIOLINKTYPES, "IO Link"}}; + + result->status = RDC_DIAG_RESULT_SKIP; + result->per_gpu_result_count = 0; + rsmi_status_t err = RSMI_STATUS_SUCCESS; + std::string info = ""; + + for (uint32_t i = 0; i < gpu_count; i++) { + for (uint32_t j = 0; j < gpu_count; j++) { + if (gpu_index[i] == gpu_index[j]) continue; + + uint64_t weight; + err = rsmi_topo_get_link_weight(gpu_index[i], gpu_index[j], &weight); + if (err != RSMI_STATUS_SUCCESS) { + result->status = RDC_DIAG_RESULT_FAIL; + result->details.code = err; + std::string err_info = "rsmi_topo_get_link_weight("; + err_info += std::to_string(gpu_index[i]) + ","; + err_info += std::to_string(gpu_index[j]) + ", &weight)"; + err_info += " fail"; + strncpy_with_null(result->details.msg, err_info.c_str(), MAX_DIAG_MSG_LENGTH); + strncpy_with_null(result->info, err_info.c_str(), MAX_DIAG_MSG_LENGTH); + return RDC_ST_MSI_ERROR; + } + + info += std::to_string(gpu_index[i]) + "=>"; + info += std::to_string(gpu_index[j]) + " weight:"; + info += std::to_string(weight) + " "; } - *result = {}; - result->test_case = 
RDC_DIAG_NODE_TOPOLOGY; + } + if (info != "") { + strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH); + } else { + strncpy_with_null(result->info, "No link detected.", MAX_DIAG_MSG_LENGTH); + } - const std::map link_to_string = { - {RSMI_IOLINK_TYPE_UNDEFINED, "Undefined"}, - {RSMI_IOLINK_TYPE_PCIEXPRESS, "PCI Express"}, - {RSMI_IOLINK_TYPE_XGMI, "XGMI"}, - {RSMI_IOLINK_TYPE_NUMIOLINKTYPES, "IO Link"} - }; - - result->status = RDC_DIAG_RESULT_SKIP; - result->per_gpu_result_count = 0; - rsmi_status_t err = RSMI_STATUS_SUCCESS; - std::string info = ""; - - for (uint32_t i=0; i < gpu_count; i++) { - for (uint32_t j=0; j < gpu_count; j++) { - if (gpu_index[i] == gpu_index[j]) continue; - - uint64_t weight; - err = rsmi_topo_get_link_weight( - gpu_index[i], gpu_index[j], &weight); - if (err != RSMI_STATUS_SUCCESS) { - result->status = RDC_DIAG_RESULT_FAIL; - result->details.code = err; - std::string err_info = "rsmi_topo_get_link_weight("; - err_info += std::to_string(gpu_index[i]) + ","; - err_info += std::to_string(gpu_index[j]) + ", &weight)"; - err_info += " fail"; - strncpy_with_null(result->details.msg, err_info.c_str(), - MAX_DIAG_MSG_LENGTH); - strncpy_with_null(result->info, err_info.c_str(), - MAX_DIAG_MSG_LENGTH); - return RDC_ST_MSI_ERROR; - } - - info += std::to_string(gpu_index[i]) + "=>"; - info += std::to_string(gpu_index[j]) + " weight:"; - info += std::to_string(weight) + " "; - } - } - if (info != "") { - strncpy_with_null(result->info, info.c_str(), - MAX_DIAG_MSG_LENGTH); - } else { - strncpy_with_null(result->info, "No link detected.", - MAX_DIAG_MSG_LENGTH); - } - - result->status = RDC_DIAG_RESULT_PASS; - return RDC_ST_OK; + result->status = RDC_DIAG_RESULT_PASS; + return RDC_ST_OK; } -rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info( - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { - if (result == nullptr) { - return RDC_ST_BAD_PARAMETER; +rdc_status_t 
RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + *result = {}; + result->test_case = RDC_DIAG_GPU_PARAMETERS; + + result->status = RDC_DIAG_RESULT_PASS; + std::string info = ""; + + for (uint32_t i = 0; i < gpu_count; i++) { + // temperature + for (rsmi_temperature_type_t sensor_type = RSMI_TEMP_TYPE_FIRST; + sensor_type != RSMI_TEMP_TYPE_LAST;) { + auto status = check_temperature_level(gpu_index[i], sensor_type, result->info, + result->gpu_results[i].gpu_result.msg); + // Set to higher error level + if (status > result->status) { + result->status = status; + } + sensor_type = static_cast(sensor_type + 1); } - *result = {}; - result->test_case = RDC_DIAG_GPU_PARAMETERS; - result->status = RDC_DIAG_RESULT_PASS; - std::string info = ""; - - for (uint32_t i=0; i < gpu_count; i++) { - // temperature - for (rsmi_temperature_type_t sensor_type = RSMI_TEMP_TYPE_FIRST; - sensor_type != RSMI_TEMP_TYPE_LAST; ) { - auto status = check_temperature_level( - gpu_index[i], sensor_type, result->info, - result->gpu_results[i].gpu_result.msg); - // Set to higher error level - if (status > result->status) { - result->status = status; - } - sensor_type = static_cast(sensor_type+1); - } - - // Voltage - for (rsmi_voltage_type_t sensor_type = RSMI_VOLT_TYPE_FIRST; - sensor_type != RSMI_VOLT_TYPE_LAST;) { - auto status = check_voltage_level( - gpu_index[i], sensor_type, result->info, - result->gpu_results[i].gpu_result.msg); - // Set to higher error level - if (status > result->status) { - result->status = status; - } - sensor_type = static_cast(sensor_type+1); - } - result->gpu_results->gpu_index = gpu_index[i]; - result->per_gpu_result_count++; + // Voltage + for (rsmi_voltage_type_t sensor_type = RSMI_VOLT_TYPE_FIRST; + sensor_type != RSMI_VOLT_TYPE_LAST;) { + auto status = check_voltage_level(gpu_index[i], sensor_type, 
result->info, + result->gpu_results[i].gpu_result.msg); + // Set to higher error level + if (status > result->status) { + result->status = status; + } + sensor_type = static_cast(sensor_type + 1); } - return RDC_ST_OK; + result->gpu_results->gpu_index = gpu_index[i]; + result->per_gpu_result_count++; + } + return RDC_ST_OK; } rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level( - uint32_t gpu_index, rsmi_temperature_type_t type - , char msg[MAX_DIAG_MSG_LENGTH] - , char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) { - rdc_diag_result_t result = RDC_DIAG_RESULT_PASS; - rsmi_temperature_metric_t met = RSMI_TEMP_CURRENT; - rsmi_status_t err = RSMI_STATUS_SUCCESS; - int64_t current_temp = 0; - std::string info = msg; - std::string per_gpu_info = per_gpu_msg; + uint32_t gpu_index, rsmi_temperature_type_t type, char msg[MAX_DIAG_MSG_LENGTH], + char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) { + rdc_diag_result_t result = RDC_DIAG_RESULT_PASS; + rsmi_temperature_metric_t met = RSMI_TEMP_CURRENT; + rsmi_status_t err = RSMI_STATUS_SUCCESS; + int64_t current_temp = 0; + std::string info = msg; + std::string per_gpu_info = per_gpu_msg; - err = rsmi_dev_temp_metric_get(gpu_index, - type, met, ¤t_temp); + err = rsmi_dev_temp_metric_get(gpu_index, type, met, ¤t_temp); - if (err != RSMI_STATUS_SUCCESS) return result; + if (err != RSMI_STATUS_SUCCESS) return result; - // Max temperature - met = RSMI_TEMP_MAX; - int64_t max_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, - type, met, &max_temp); - if (err == RSMI_STATUS_SUCCESS) { - if (current_temp >= max_temp) { - result = RDC_DIAG_RESULT_WARN; - per_gpu_info += "Max "; - per_gpu_info += get_temperature_string(type); - per_gpu_info += " temperature "; - per_gpu_info += std::to_string(max_temp); - per_gpu_info += " greater than current temperature "; - per_gpu_info += std::to_string(current_temp) +". 
"; - info += get_temperature_string(type) + ": "; - info += "GPU " + std::to_string(gpu_index); - info += " max "; - info += get_temperature_string(type); - info += " temperature exceeds. "; - } else { - info += "GPU " + std::to_string(gpu_index); - info += " max "; - info += get_temperature_string(type); - info += " temperature in range. "; - } + // Max temperature + met = RSMI_TEMP_MAX; + int64_t max_temp = 0; + err = rsmi_dev_temp_metric_get(gpu_index, type, met, &max_temp); + if (err == RSMI_STATUS_SUCCESS) { + if (current_temp >= max_temp) { + result = RDC_DIAG_RESULT_WARN; + per_gpu_info += "Max "; + per_gpu_info += get_temperature_string(type); + per_gpu_info += " temperature "; + per_gpu_info += std::to_string(max_temp); + per_gpu_info += " greater than current temperature "; + per_gpu_info += std::to_string(current_temp) + ". "; + info += get_temperature_string(type) + ": "; + info += "GPU " + std::to_string(gpu_index); + info += " max "; + info += get_temperature_string(type); + info += " temperature exceeds. "; + } else { + info += "GPU " + std::to_string(gpu_index); + info += " max "; + info += get_temperature_string(type); + info += " temperature in range. "; } + } - met = RSMI_TEMP_MIN; - int64_t min_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, - type, met, &min_temp); - if (err == RSMI_STATUS_SUCCESS) { - if (current_temp <= min_temp) { - result = RDC_DIAG_RESULT_WARN; - per_gpu_info += "Min "; - per_gpu_info += get_temperature_string(type); - per_gpu_info += " temperature "; - per_gpu_info += std::to_string(min_temp); - per_gpu_info += " less than current temperature "; - per_gpu_info += std::to_string(current_temp) +". "; - info += "GPU " + std::to_string(gpu_index); - info += " min "; - info += get_temperature_string(type); - info += " temperature exceeds. "; - } else { - info += "GPU " + std::to_string(gpu_index); - info += " min "; - info += get_temperature_string(type); - info += " temperature in range. 
"; - } + met = RSMI_TEMP_MIN; + int64_t min_temp = 0; + err = rsmi_dev_temp_metric_get(gpu_index, type, met, &min_temp); + if (err == RSMI_STATUS_SUCCESS) { + if (current_temp <= min_temp) { + result = RDC_DIAG_RESULT_WARN; + per_gpu_info += "Min "; + per_gpu_info += get_temperature_string(type); + per_gpu_info += " temperature "; + per_gpu_info += std::to_string(min_temp); + per_gpu_info += " less than current temperature "; + per_gpu_info += std::to_string(current_temp) + ". "; + info += "GPU " + std::to_string(gpu_index); + info += " min "; + info += get_temperature_string(type); + info += " temperature exceeds. "; + } else { + info += "GPU " + std::to_string(gpu_index); + info += " min "; + info += get_temperature_string(type); + info += " temperature in range. "; } + } - met = RSMI_TEMP_CRITICAL; - int64_t critical_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, - type, met, &critical_temp); - if (err == RSMI_STATUS_SUCCESS) { - if (current_temp >= critical_temp) { - result = RDC_DIAG_RESULT_FAIL; - per_gpu_info += "Critical "; - per_gpu_info += get_temperature_string(type); - per_gpu_info += " temperature "; - per_gpu_info += std::to_string(critical_temp); - per_gpu_info += " greater than current temperature "; - per_gpu_info += std::to_string(current_temp) +". "; - info += "GPU " + std::to_string(gpu_index); - info += " Critical "; - info += get_temperature_string(type); - info += " temperature exceeds. "; - } else { - info += "GPU " + std::to_string(gpu_index); - info += " Critical "; - info += get_temperature_string(type); - info += " temperature in range. 
"; - } + met = RSMI_TEMP_CRITICAL; + int64_t critical_temp = 0; + err = rsmi_dev_temp_metric_get(gpu_index, type, met, &critical_temp); + if (err == RSMI_STATUS_SUCCESS) { + if (current_temp >= critical_temp) { + result = RDC_DIAG_RESULT_FAIL; + per_gpu_info += "Critical "; + per_gpu_info += get_temperature_string(type); + per_gpu_info += " temperature "; + per_gpu_info += std::to_string(critical_temp); + per_gpu_info += " greater than current temperature "; + per_gpu_info += std::to_string(current_temp) + ". "; + info += "GPU " + std::to_string(gpu_index); + info += " Critical "; + info += get_temperature_string(type); + info += " temperature exceeds. "; + } else { + info += "GPU " + std::to_string(gpu_index); + info += " Critical "; + info += get_temperature_string(type); + info += " temperature in range. "; } + } - met = RSMI_TEMP_EMERGENCY; - int64_t emergency_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, - type, met, &emergency_temp); - if (err == RSMI_STATUS_SUCCESS) { - if (current_temp >= critical_temp) { - result = RDC_DIAG_RESULT_FAIL; - per_gpu_info += "Emergency "; - per_gpu_info += get_temperature_string(type); - per_gpu_info += " temperature "; - per_gpu_info += std::to_string(emergency_temp); - per_gpu_info += " greater than current temperature "; - per_gpu_info += std::to_string(current_temp) +". "; - info += "GPU " + std::to_string(gpu_index); - info += " Emergency "; - info += get_temperature_string(type); - info += " temperature exceeds. "; - } else { - info += "GPU " + std::to_string(gpu_index); - info += " Emergency "; - info += get_temperature_string(type); - info += " temperature in range. 
"; - } + met = RSMI_TEMP_EMERGENCY; + int64_t emergency_temp = 0; + err = rsmi_dev_temp_metric_get(gpu_index, type, met, &emergency_temp); + if (err == RSMI_STATUS_SUCCESS) { + if (current_temp >= critical_temp) { + result = RDC_DIAG_RESULT_FAIL; + per_gpu_info += "Emergency "; + per_gpu_info += get_temperature_string(type); + per_gpu_info += " temperature "; + per_gpu_info += std::to_string(emergency_temp); + per_gpu_info += " greater than current temperature "; + per_gpu_info += std::to_string(current_temp) + ". "; + info += "GPU " + std::to_string(gpu_index); + info += " Emergency "; + info += get_temperature_string(type); + info += " temperature exceeds. "; + } else { + info += "GPU " + std::to_string(gpu_index); + info += " Emergency "; + info += get_temperature_string(type); + info += " temperature in range. "; } + } - met = RSMI_TEMP_CRIT_MIN; - int64_t critical_min_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, - type, met, &critical_min_temp); - if (err == RSMI_STATUS_SUCCESS) { - if (current_temp <= critical_min_temp) { - result = RDC_DIAG_RESULT_FAIL; - per_gpu_info += "Critical Min "; - per_gpu_info += get_temperature_string(type); - per_gpu_info += " temperature "; - per_gpu_info += std::to_string(critical_min_temp); - per_gpu_info += " less than current temperature "; - per_gpu_info += std::to_string(current_temp) +". "; - info += "GPU " + std::to_string(gpu_index); - info += " Critical Min "; - info += get_temperature_string(type); - info += " temperature exceeds. "; - } else { - info += "GPU " + std::to_string(gpu_index); - info += " Critical Min "; - info += get_temperature_string(type); - info += " temperature in range. 
"; - } + met = RSMI_TEMP_CRIT_MIN; + int64_t critical_min_temp = 0; + err = rsmi_dev_temp_metric_get(gpu_index, type, met, &critical_min_temp); + if (err == RSMI_STATUS_SUCCESS) { + if (current_temp <= critical_min_temp) { + result = RDC_DIAG_RESULT_FAIL; + per_gpu_info += "Critical Min "; + per_gpu_info += get_temperature_string(type); + per_gpu_info += " temperature "; + per_gpu_info += std::to_string(critical_min_temp); + per_gpu_info += " less than current temperature "; + per_gpu_info += std::to_string(current_temp) + ". "; + info += "GPU " + std::to_string(gpu_index); + info += " Critical Min "; + info += get_temperature_string(type); + info += " temperature exceeds. "; + } else { + info += "GPU " + std::to_string(gpu_index); + info += " Critical Min "; + info += get_temperature_string(type); + info += " temperature in range. "; } + } - strncpy_with_null(msg, info.c_str(), - MAX_DIAG_MSG_LENGTH); - strncpy_with_null(per_gpu_msg, per_gpu_info.c_str(), - MAX_DIAG_MSG_LENGTH); + strncpy_with_null(msg, info.c_str(), MAX_DIAG_MSG_LENGTH); + strncpy_with_null(per_gpu_msg, per_gpu_info.c_str(), MAX_DIAG_MSG_LENGTH); - return result; + return result; } +rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index, + rsmi_voltage_type_t type, + char msg[MAX_DIAG_MSG_LENGTH], + char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) { + rdc_diag_result_t result = RDC_DIAG_RESULT_PASS; + rsmi_voltage_metric_t met = RSMI_VOLT_CURRENT; + rsmi_status_t err = RSMI_STATUS_SUCCESS; + int64_t current_voltage = 0; + std::string info = msg; + std::string per_gpu_info = per_gpu_msg; -rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level( - uint32_t gpu_index, rsmi_voltage_type_t type - , char msg[MAX_DIAG_MSG_LENGTH] - , char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) { - rdc_diag_result_t result = RDC_DIAG_RESULT_PASS; - rsmi_voltage_metric_t met = RSMI_VOLT_CURRENT; - rsmi_status_t err = RSMI_STATUS_SUCCESS; - int64_t current_voltage = 0; - std::string info = msg; - std::string 
per_gpu_info = per_gpu_msg; + err = rsmi_dev_volt_metric_get(gpu_index, type, met, &current_voltage); + if (err != RSMI_STATUS_SUCCESS) return result; - err = rsmi_dev_volt_metric_get(gpu_index, - type, met, &current_voltage); - if (err != RSMI_STATUS_SUCCESS) return result; - - // Max voltage - met = RSMI_VOLT_MAX; - int64_t max_volt = 0; - err = rsmi_dev_volt_metric_get(gpu_index, - type, met, &max_volt); - if (err == RSMI_STATUS_SUCCESS) { - if (current_voltage >= max_volt) { - result = RDC_DIAG_RESULT_WARN; - per_gpu_info += "Max "; - per_gpu_info += get_voltage_string(type); - per_gpu_info += " voltage "; - per_gpu_info += std::to_string(max_volt); - per_gpu_info += " greater than current voltage "; - per_gpu_info += std::to_string(current_voltage) +". "; - info += get_voltage_string(type) + ": "; - info += "GPU " + std::to_string(gpu_index); - info += " max "; - info += get_voltage_string(type); - info += " voltage exceeds. "; - } else { - info += "GPU " + std::to_string(gpu_index); - info += " max "; - info += get_voltage_string(type); - info += " voltage in range. "; - } + // Max voltage + met = RSMI_VOLT_MAX; + int64_t max_volt = 0; + err = rsmi_dev_volt_metric_get(gpu_index, type, met, &max_volt); + if (err == RSMI_STATUS_SUCCESS) { + if (current_voltage >= max_volt) { + result = RDC_DIAG_RESULT_WARN; + per_gpu_info += "Max "; + per_gpu_info += get_voltage_string(type); + per_gpu_info += " voltage "; + per_gpu_info += std::to_string(max_volt); + per_gpu_info += " greater than current voltage "; + per_gpu_info += std::to_string(current_voltage) + ". "; + info += get_voltage_string(type) + ": "; + info += "GPU " + std::to_string(gpu_index); + info += " max "; + info += get_voltage_string(type); + info += " voltage exceeds. "; + } else { + info += "GPU " + std::to_string(gpu_index); + info += " max "; + info += get_voltage_string(type); + info += " voltage in range. 
"; } + } - // Min voltage - met = RSMI_VOLT_MIN; - int64_t min_volt = 0; - err = rsmi_dev_volt_metric_get(gpu_index, - type, met, &min_volt); - if (err == RSMI_STATUS_SUCCESS) { - if (current_voltage <= min_volt) { - result = RDC_DIAG_RESULT_WARN; - per_gpu_info += "Min "; - per_gpu_info += get_voltage_string(type); - per_gpu_info += " voltage "; - per_gpu_info += std::to_string(min_volt); - per_gpu_info += " less than current voltage "; - per_gpu_info += std::to_string(current_voltage) +". "; - info += get_voltage_string(type) + ": "; - info += "GPU " + std::to_string(gpu_index); - info += " min "; - info += get_voltage_string(type); - info += " voltage exceeds. "; - } else { - info += "GPU " + std::to_string(gpu_index); - info += " min "; - info += get_voltage_string(type); - info += " voltage in range. "; - } + // Min voltage + met = RSMI_VOLT_MIN; + int64_t min_volt = 0; + err = rsmi_dev_volt_metric_get(gpu_index, type, met, &min_volt); + if (err == RSMI_STATUS_SUCCESS) { + if (current_voltage <= min_volt) { + result = RDC_DIAG_RESULT_WARN; + per_gpu_info += "Min "; + per_gpu_info += get_voltage_string(type); + per_gpu_info += " voltage "; + per_gpu_info += std::to_string(min_volt); + per_gpu_info += " less than current voltage "; + per_gpu_info += std::to_string(current_voltage) + ". "; + info += get_voltage_string(type) + ": "; + info += "GPU " + std::to_string(gpu_index); + info += " min "; + info += get_voltage_string(type); + info += " voltage exceeds. "; + } else { + info += "GPU " + std::to_string(gpu_index); + info += " min "; + info += get_voltage_string(type); + info += " voltage in range. 
"; } + } - // Max Critical voltage - met = RSMI_VOLT_MAX_CRIT; - int64_t critical_max_volt = 0; - err = rsmi_dev_volt_metric_get(gpu_index, - type, met, &critical_max_volt); - if (err == RSMI_STATUS_SUCCESS) { - if (current_voltage >= critical_max_volt) { - result = RDC_DIAG_RESULT_FAIL; - per_gpu_info += "Critical Max "; - per_gpu_info += get_voltage_string(type); - per_gpu_info += " voltage "; - per_gpu_info += std::to_string(critical_max_volt); - per_gpu_info += " greater than current voltage "; - per_gpu_info += std::to_string(current_voltage) +". "; - info += get_voltage_string(type) + ": "; - info += "GPU " + std::to_string(gpu_index); - info += " Critical max "; - info += get_voltage_string(type); - info += " voltage exceeds. "; - } else { - info += "GPU " + std::to_string(gpu_index); - info += " Critical max "; - info += get_voltage_string(type); - info += " voltage in range. "; - } + // Max Critical voltage + met = RSMI_VOLT_MAX_CRIT; + int64_t critical_max_volt = 0; + err = rsmi_dev_volt_metric_get(gpu_index, type, met, &critical_max_volt); + if (err == RSMI_STATUS_SUCCESS) { + if (current_voltage >= critical_max_volt) { + result = RDC_DIAG_RESULT_FAIL; + per_gpu_info += "Critical Max "; + per_gpu_info += get_voltage_string(type); + per_gpu_info += " voltage "; + per_gpu_info += std::to_string(critical_max_volt); + per_gpu_info += " greater than current voltage "; + per_gpu_info += std::to_string(current_voltage) + ". "; + info += get_voltage_string(type) + ": "; + info += "GPU " + std::to_string(gpu_index); + info += " Critical max "; + info += get_voltage_string(type); + info += " voltage exceeds. "; + } else { + info += "GPU " + std::to_string(gpu_index); + info += " Critical max "; + info += get_voltage_string(type); + info += " voltage in range. 
"; } + } - // Min Critical voltage - met = RSMI_VOLT_MIN_CRIT; - int64_t critical_min_volt = 0; - err = rsmi_dev_volt_metric_get(gpu_index, - type, met, &critical_min_volt); - if (err == RSMI_STATUS_SUCCESS) { - if (current_voltage <= critical_min_volt) { - result = RDC_DIAG_RESULT_FAIL; - per_gpu_info += "Critical Min "; - per_gpu_info += get_voltage_string(type); - per_gpu_info += " voltage "; - per_gpu_info += std::to_string(critical_min_volt); - per_gpu_info += " less than current voltage "; - per_gpu_info += std::to_string(current_voltage) +". "; - info += get_voltage_string(type) + ": "; - info += "GPU " + std::to_string(gpu_index); - info += " Critical min "; - info += get_voltage_string(type); - info += " voltage exceeds. "; - } else { - info += "GPU " + std::to_string(gpu_index); - info += " Critical min "; - info += get_voltage_string(type); - info += " voltage in range. "; - } + // Min Critical voltage + met = RSMI_VOLT_MIN_CRIT; + int64_t critical_min_volt = 0; + err = rsmi_dev_volt_metric_get(gpu_index, type, met, &critical_min_volt); + if (err == RSMI_STATUS_SUCCESS) { + if (current_voltage <= critical_min_volt) { + result = RDC_DIAG_RESULT_FAIL; + per_gpu_info += "Critical Min "; + per_gpu_info += get_voltage_string(type); + per_gpu_info += " voltage "; + per_gpu_info += std::to_string(critical_min_volt); + per_gpu_info += " less than current voltage "; + per_gpu_info += std::to_string(current_voltage) + ". "; + info += get_voltage_string(type) + ": "; + info += "GPU " + std::to_string(gpu_index); + info += " Critical min "; + info += get_voltage_string(type); + info += " voltage exceeds. "; + } else { + info += "GPU " + std::to_string(gpu_index); + info += " Critical min "; + info += get_voltage_string(type); + info += " voltage in range. 
"; } + } - strncpy_with_null(msg, info.c_str(), - MAX_DIAG_MSG_LENGTH); - strncpy_with_null(per_gpu_msg, per_gpu_info.c_str(), - MAX_DIAG_MSG_LENGTH); + strncpy_with_null(msg, info.c_str(), MAX_DIAG_MSG_LENGTH); + strncpy_with_null(per_gpu_msg, per_gpu_info.c_str(), MAX_DIAG_MSG_LENGTH); - return result; + return result; } } // namespace rdc } // namespace amd - diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 7ce450070a..24d16a6ac3 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -19,220 +19,199 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include -#include -#include -#include "rdc_lib/rdc_common.h" -#include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcSmiLib.h" +#include +#include +#include + +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf): metric_fetcher_(mf) - , bulk_fetch_enabled_(false) - , smi_diag_(std::make_shared()) { - char* bulk_env = getenv("RDC_BULK_FETCH_ENABLED"); - if (bulk_env != nullptr && strcasecmp(bulk_env, "true") == 0) { - RDC_LOG(RDC_DEBUG, "Bulk fetch enabled."); - bulk_fetch_enabled_ = true; - } else { - RDC_LOG(RDC_DEBUG, "Bulk fetch disabled."); - } +RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf) + : metric_fetcher_(mf), + bulk_fetch_enabled_(false), + smi_diag_(std::make_shared()) { + char* bulk_env = getenv("RDC_BULK_FETCH_ENABLED"); + if (bulk_env != nullptr && strcasecmp(bulk_env, "true") == 0) { + RDC_LOG(RDC_DEBUG, "Bulk fetch enabled."); + bulk_fetch_enabled_ = true; + } else { + RDC_LOG(RDC_DEBUG, "Bulk fetch disabled."); + } } // Bulk fetch wrapper for the rocm_smi_lib. This will be replaced after // rocm_smi_lib can support bulk fetch. 
rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, - uint32_t fields_count, rdc_field_value_f callback, - void* user_data) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } + uint32_t fields_count, + rdc_field_value_f callback, + void* user_data) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - RDC_LOG(RDC_DEBUG, "Fetch " << fields_count - << " fields from rocm_smi_lib."); + RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocm_smi_lib."); - // Bulk fetch fields - std::vector bulk_results; - if (bulk_fetch_enabled_) { - rdc_status_t status = metric_fetcher_->bulk_fetch_smi_fields( - fields, fields_count, bulk_results); - RDC_LOG(RDC_DEBUG, "Bulk fetched " << bulk_results.size() - << " fields from rocm_smi_lib which return " << status); - if (bulk_results.size() > 0) { - rdc_status_t status = callback(&bulk_results[0], - bulk_results.size(), user_data); - if (status != RDC_ST_OK) { - return status; - } - } + // Bulk fetch fields + std::vector bulk_results; + if (bulk_fetch_enabled_) { + rdc_status_t status = + metric_fetcher_->bulk_fetch_smi_fields(fields, fields_count, bulk_results); + RDC_LOG(RDC_DEBUG, "Bulk fetched " << bulk_results.size() + << " fields from rocm_smi_lib which return " << status); + if (bulk_results.size() > 0) { + rdc_status_t status = callback(&bulk_results[0], bulk_results.size(), user_data); + if (status != RDC_ST_OK) { + return status; + } } + } - // Fetch it one by one for left fields - const int BULK_FIELDS_MAX = 16; - rdc_gpu_field_value_t values[BULK_FIELDS_MAX]; - uint32_t bulk_count = 0; - for (uint32_t i = 0; i < fields_count; i++) { - bool is_fetched = false; - for (std::size_t j = 0; j < bulk_results.size(); j++) { - if (bulk_results[j].gpu_index == fields[i].gpu_index && - bulk_results[j].field_value.field_id == fields[i].field_id) { - is_fetched = true; - break; - } - } - if (is_fetched) continue; - if (bulk_count >= BULK_FIELDS_MAX) { - rdc_status_t status = 
callback(values, bulk_count, user_data); - // When the callback returns errors, stop processing and return. - if (status != RDC_ST_OK) { - return status; - } - bulk_count = 0; - } - values[bulk_count].gpu_index = fields[i].gpu_index; - metric_fetcher_->fetch_smi_field( - fields[i].gpu_index, - static_cast(fields[i].field_id), - &(values[bulk_count].field_value)); - bulk_count++; + // Fetch it one by one for left fields + const int BULK_FIELDS_MAX = 16; + rdc_gpu_field_value_t values[BULK_FIELDS_MAX]; + uint32_t bulk_count = 0; + for (uint32_t i = 0; i < fields_count; i++) { + bool is_fetched = false; + for (std::size_t j = 0; j < bulk_results.size(); j++) { + if (bulk_results[j].gpu_index == fields[i].gpu_index && + bulk_results[j].field_value.field_id == fields[i].field_id) { + is_fetched = true; + break; + } } - if (bulk_count != 0) { - rdc_status_t status = callback(values, bulk_count, user_data); - if (status != RDC_ST_OK) { - return status; - } - bulk_count = 0; + if (is_fetched) continue; + if (bulk_count >= BULK_FIELDS_MAX) { + rdc_status_t status = callback(values, bulk_count, user_data); + // When the callback returns errors, stop processing and return. 
+ if (status != RDC_ST_OK) { + return status; + } + bulk_count = 0; } + values[bulk_count].gpu_index = fields[i].gpu_index; + metric_fetcher_->fetch_smi_field(fields[i].gpu_index, + static_cast(fields[i].field_id), + &(values[bulk_count].field_value)); + bulk_count++; + } + if (bulk_count != 0) { + rdc_status_t status = callback(values, bulk_count, user_data); + if (status != RDC_ST_OK) { + return status; + } + bulk_count = 0; + } - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, - uint32_t fields_count) { +rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) { rdc_status_t ret; - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - for (uint32_t i = 0; i < fields_count; i++) { - ret = metric_fetcher_->acquire_rsmi_handle( - {fields[i].gpu_index, fields[i].field_id}); - if (ret != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, - "Failed to acquire rocm_smi handle for field."); - } + for (uint32_t i = 0; i < fields_count; i++) { + ret = metric_fetcher_->acquire_rsmi_handle({fields[i].gpu_index, fields[i].field_id}); + if (ret != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Failed to acquire rocm_smi handle for field."); } - RDC_LOG(RDC_DEBUG, "acquire " << fields_count - << " field handles from rocm_smi_lib"); + } + RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from rocm_smi_lib"); - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcSmiLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, - uint32_t fields_count) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } + uint32_t fields_count) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - for (uint32_t i = 0; i < fields_count; i++) { - metric_fetcher_->delete_rsmi_handle( - {fields[i].gpu_index, fields[i].field_id}); - } - RDC_LOG(RDC_DEBUG, "delete " << fields_count - << " field handles from 
rocm_smi_lib"); + for (uint32_t i = 0; i < fields_count; i++) { + metric_fetcher_->delete_rsmi_handle({fields[i].gpu_index, fields[i].field_id}); + } + RDC_LOG(RDC_DEBUG, "delete " << fields_count << " field handles from rocm_smi_lib"); - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcSmiLib::rdc_telemetry_fields_query( - uint32_t field_ids[MAX_NUM_FIELDS], - uint32_t* field_count) { - if (field_count == nullptr) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count) { + if (field_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - // List of fields supported by rocm_smi_lib - const std::vector fields{ - RDC_FI_GPU_COUNT, RDC_FI_DEV_NAME, RDC_FI_GPU_CLOCK, - RDC_FI_MEM_CLOCK, RDC_FI_MEMORY_TEMP, RDC_FI_GPU_TEMP, - RDC_FI_POWER_USAGE, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, - RDC_FI_GPU_UTIL, RDC_FI_GPU_MEMORY_USAGE, RDC_FI_GPU_MEMORY_TOTAL, - RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL, - RDC_EVNT_XGMI_0_NOP_TX, RDC_EVNT_XGMI_0_REQ_TX, - RDC_EVNT_XGMI_0_RESP_TX, RDC_EVNT_XGMI_0_BEATS_TX, - RDC_EVNT_XGMI_1_NOP_TX, RDC_EVNT_XGMI_1_REQ_TX, - RDC_EVNT_XGMI_1_RESP_TX, RDC_EVNT_XGMI_1_BEATS_TX, - RDC_EVNT_XGMI_0_THRPUT, RDC_EVNT_XGMI_1_THRPUT, - RDC_EVNT_XGMI_2_THRPUT, RDC_EVNT_XGMI_3_THRPUT, - RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, - }; - std::copy(fields.begin(), fields.end(), field_ids); - *field_count = fields.size(); + // List of fields supported by rocm_smi_lib + const std::vector fields{ + RDC_FI_GPU_COUNT, RDC_FI_DEV_NAME, + RDC_FI_GPU_CLOCK, RDC_FI_MEM_CLOCK, + RDC_FI_MEMORY_TEMP, RDC_FI_GPU_TEMP, + RDC_FI_POWER_USAGE, RDC_FI_PCIE_TX, + RDC_FI_PCIE_RX, RDC_FI_GPU_UTIL, + RDC_FI_GPU_MEMORY_USAGE, RDC_FI_GPU_MEMORY_TOTAL, + RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL, + RDC_EVNT_XGMI_0_NOP_TX, RDC_EVNT_XGMI_0_REQ_TX, + RDC_EVNT_XGMI_0_RESP_TX, RDC_EVNT_XGMI_0_BEATS_TX, + RDC_EVNT_XGMI_1_NOP_TX, RDC_EVNT_XGMI_1_REQ_TX, + 
RDC_EVNT_XGMI_1_RESP_TX, RDC_EVNT_XGMI_1_BEATS_TX, + RDC_EVNT_XGMI_0_THRPUT, RDC_EVNT_XGMI_1_THRPUT, + RDC_EVNT_XGMI_2_THRPUT, RDC_EVNT_XGMI_3_THRPUT, + RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, + }; + std::copy(fields.begin(), fields.end(), field_ids); + *field_count = fields.size(); - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcSmiLib::rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) { - if (test_case_count == nullptr) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcSmiLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + if (test_case_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - const std::vector tests { - RDC_DIAG_COMPUTE_PROCESS, - RDC_DIAG_NODE_TOPOLOGY, - RDC_DIAG_GPU_PARAMETERS - }; - std::copy(tests.begin(), tests.end(), test_cases); - *test_case_count = tests.size(); - return RDC_ST_OK; + const std::vector tests{RDC_DIAG_COMPUTE_PROCESS, RDC_DIAG_NODE_TOPOLOGY, + RDC_DIAG_GPU_PARAMETERS}; + std::copy(tests.begin(), tests.end(), test_cases); + *test_case_count = tests.size(); + return RDC_ST_OK; } // Run a specific test case -rdc_status_t RdcSmiLib::rdc_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { - if (result == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - switch (test_case) { - case RDC_DIAG_COMPUTE_PROCESS: - return smi_diag_->check_rsmi_process_info( - gpu_index, gpu_count, result); - case RDC_DIAG_NODE_TOPOLOGY: - return smi_diag_->check_rsmi_topo_info( - gpu_index, gpu_count, result); - case RDC_DIAG_GPU_PARAMETERS: - return smi_diag_->check_rsmi_param_info( - gpu_index, gpu_count, result); - default: - return RDC_ST_NOT_SUPPORTED; - } +rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, 
rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + switch (test_case) { + case RDC_DIAG_COMPUTE_PROCESS: + return smi_diag_->check_rsmi_process_info(gpu_index, gpu_count, result); + case RDC_DIAG_NODE_TOPOLOGY: + return smi_diag_->check_rsmi_topo_info(gpu_index, gpu_count, result); + case RDC_DIAG_GPU_PARAMETERS: + return smi_diag_->check_rsmi_param_info(gpu_index, gpu_count, result); + default: + return RDC_ST_NOT_SUPPORTED; + } } -rdc_status_t RdcSmiLib::rdc_diagnostic_run( - const rdc_group_info_t&, - rdc_diag_level_t, - rdc_diag_response_t*) { - return RDC_ST_NOT_SUPPORTED; +rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t&, rdc_diag_level_t, + rdc_diag_response_t*) { + return RDC_ST_NOT_SUPPORTED; } -rdc_status_t RdcSmiLib::rdc_diag_init(uint64_t) { - return RDC_ST_OK; -} +rdc_status_t RdcSmiLib::rdc_diag_init(uint64_t) { return RDC_ST_OK; } -rdc_status_t RdcSmiLib::rdc_diag_destroy() { - return RDC_ST_OK; -} +rdc_status_t RdcSmiLib::rdc_diag_destroy() { return RDC_ST_OK; } } // namespace rdc } // namespace amd - diff --git a/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc b/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc index e05990fb88..1d97e10736 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc @@ -20,8 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "rdc_lib/impl/RdcTelemetryModule.h" + #include #include + #include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/impl/RdcRasLib.h" @@ -31,140 +33,128 @@ namespace amd { namespace rdc { // Return all supported fields -rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_query( - uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) { - if (field_count == nullptr) { - return RDC_ST_BAD_PARAMETER; - } - auto ite = telemetry_modules_.begin(); - *field_count = 0; - for (; ite != telemetry_modules_.end(); ite++) { - uint32_t count = 0; - rdc_status_t status = (*ite)->rdc_telemetry_fields_query( - &(field_ids[*field_count]), &count); - if (status == RDC_ST_OK) { - *field_count += count; - } +rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count) { + if (field_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + auto ite = telemetry_modules_.begin(); + *field_count = 0; + for (; ite != telemetry_modules_.end(); ite++) { + uint32_t count = 0; + rdc_status_t status = (*ite)->rdc_telemetry_fields_query(&(field_ids[*field_count]), &count); + if (status == RDC_ST_OK) { + *field_count += count; } + } - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_watch( - rdc_gpu_field_t* fields, uint32_t fields_count) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; +rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, + uint32_t fields_count) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + std::map> fields_in_module; + std::vector unsupport_fields; + get_fields_for_module(fields, fields_count, fields_in_module, unsupport_fields); + + auto ite = fields_in_module.begin(); + for (; ite != fields_in_module.end(); ite++) { + if (ite->second.size() > 0) { + ite->first->rdc_telemetry_fields_watch(&ite->second[0], ite->second.size()); } + } - std::map> fields_in_module; - 
std::vector unsupport_fields; - get_fields_for_module(fields, fields_count, - fields_in_module, unsupport_fields); - - auto ite = fields_in_module.begin(); - for (; ite != fields_in_module.end(); ite++) { - if (ite->second.size() > 0) { - ite->first->rdc_telemetry_fields_watch( - &ite->second[0], ite->second.size()); - } - } - - return RDC_ST_OK; + return RDC_ST_OK; } +rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, + uint32_t fields_count) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } -rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch( - rdc_gpu_field_t* fields, uint32_t fields_count) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; + std::map> fields_in_module; + std::vector unsupport_fields; + get_fields_for_module(fields, fields_count, fields_in_module, unsupport_fields); + + auto ite = fields_in_module.begin(); + for (; ite != fields_in_module.end(); ite++) { + if (ite->second.size() > 0) { + ite->first->rdc_telemetry_fields_unwatch(&ite->second[0], ite->second.size()); } + } - std::map> fields_in_module; - std::vector unsupport_fields; - get_fields_for_module(fields, fields_count, - fields_in_module, unsupport_fields); - - auto ite = fields_in_module.begin(); - for (; ite != fields_in_module.end(); ite++) { - if (ite->second.size() > 0) { - ite->first->rdc_telemetry_fields_unwatch( - &ite->second[0], ite->second.size()); - } - } - - return RDC_ST_OK; + return RDC_ST_OK; } RdcTelemetryModule::RdcTelemetryModule(RdcMetricFetcherPtr fetcher) { - const RdcSmiLibPtr smi_module = std::make_shared(fetcher); - const RdcRasLibPtr ras_module = std::make_shared(); - telemetry_modules_.push_back(smi_module); - if (ras_module) { - telemetry_modules_.push_back(ras_module); - } + const RdcSmiLibPtr smi_module = std::make_shared(fetcher); + const RdcRasLibPtr ras_module = std::make_shared(); + telemetry_modules_.push_back(smi_module); + if (ras_module) { + 
telemetry_modules_.push_back(ras_module); + } - auto ite = telemetry_modules_.begin(); - for (; ite != telemetry_modules_.end(); ite++) { - uint32_t field_ids[MAX_NUM_FIELDS]; - uint32_t field_count = 0; - rdc_status_t status = (*ite)-> - rdc_telemetry_fields_query(field_ids, &field_count); - if (status == RDC_ST_OK) { - for (uint32_t index = 0; index < field_count; index++) { - fields_id_module_.insert({field_ids[index], (*ite)}); - } - } + auto ite = telemetry_modules_.begin(); + for (; ite != telemetry_modules_.end(); ite++) { + uint32_t field_ids[MAX_NUM_FIELDS]; + uint32_t field_count = 0; + rdc_status_t status = (*ite)->rdc_telemetry_fields_query(field_ids, &field_count); + if (status == RDC_ST_OK) { + for (uint32_t index = 0; index < field_count; index++) { + fields_id_module_.insert({field_ids[index], (*ite)}); + } } + } } void RdcTelemetryModule::get_fields_for_module( - rdc_gpu_field_t* fields, - uint32_t fields_count, - std::map>& - fields_in_module, - std::vector& unsupport_fields) { - for (uint32_t findex = 0; findex < fields_count; findex++) { - RdcTelemetryPtr module = fields_id_module_[fields[findex].field_id]; - if (module) { - fields_in_module[module].push_back(fields[findex]); - } else { - RDC_LOG(RDC_DEBUG, "Unsupported field " << - field_id_string(fields[findex].field_id)); - rdc_gpu_field_value_t value; - value.gpu_index = fields[findex].gpu_index; - value.field_value.field_id = fields[findex].field_id; - value.field_value.status = RDC_ST_NOT_SUPPORTED; - unsupport_fields.push_back(value); - } + rdc_gpu_field_t* fields, uint32_t fields_count, + std::map>& fields_in_module, + std::vector& unsupport_fields) { + for (uint32_t findex = 0; findex < fields_count; findex++) { + RdcTelemetryPtr module = fields_id_module_[fields[findex].field_id]; + if (module) { + fields_in_module[module].push_back(fields[findex]); + } else { + RDC_LOG(RDC_DEBUG, "Unsupported field " << field_id_string(fields[findex].field_id)); + rdc_gpu_field_value_t value; + 
value.gpu_index = fields[findex].gpu_index; + value.field_value.field_id = fields[findex].field_id; + value.field_value.status = RDC_ST_NOT_SUPPORTED; + unsupport_fields.push_back(value); } + } } +rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, + uint32_t fields_count, + rdc_field_value_f callback, + void* user_data) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } -rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get( - rdc_gpu_field_t* fields, uint32_t fields_count, - rdc_field_value_f callback, void* user_data) { - if (fields == nullptr) { - return RDC_ST_BAD_PARAMETER; - } + // Dispatch the fields to the libraries + std::map> fields_to_fetch; + std::vector unsupport_fields; + get_fields_for_module(fields, fields_count, fields_to_fetch, unsupport_fields); - // Dispatch the fields to the libraries - std::map> fields_to_fetch; - std::vector unsupport_fields; - get_fields_for_module(fields, fields_count, - fields_to_fetch, unsupport_fields); + auto ite = fields_to_fetch.begin(); + for (; ite != fields_to_fetch.end(); ite++) { + rdc_gpu_field_t f[MAX_NUM_FIELDS]; + std::copy(ite->second.begin(), ite->second.end(), f); + ite->first->rdc_telemetry_fields_value_get(f, ite->second.size(), callback, user_data); + } - auto ite = fields_to_fetch.begin(); - for (; ite != fields_to_fetch.end(); ite ++) { - rdc_gpu_field_t f[MAX_NUM_FIELDS]; - std::copy(ite->second.begin(), ite->second.end(), f); - ite->first->rdc_telemetry_fields_value_get(f, - ite->second.size(), callback, user_data); - } + // Notify the caller unsupported fields + callback(unsupport_fields.data(), unsupport_fields.size(), user_data); - // Notify the caller unsupported fields - callback(&unsupport_fields[0], unsupport_fields.size(), user_data); - - return RDC_ST_OK; + return RDC_ST_OK; } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc index 
93cbbecad9..b4c549da1d 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -20,482 +20,450 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include -#include -#include -#include -#include -#include #include "rdc_lib/impl/RdcWatchTableImpl.h" + +#include + +#include +#include +#include +#include +#include + +#include "common/rdc_utils.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcMetricFetcherImpl.h" #include "rdc_lib/rdc_common.h" -#include "common/rdc_utils.h" -#include "rdc_lib/RdcLogger.h" -#include "rdc/rdc.h" namespace amd { namespace rdc { RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, - const RdcCacheManagerPtr& cache_mgr, - const RdcModuleMgrPtr& module_mgr, - const RdcNotificationPtr& notif): - group_settings_(group_settings) - , cache_mgr_(cache_mgr) - , rdc_module_mgr_(module_mgr) - , notifications_(notif) - , last_cleanup_time_(0) { -} + const RdcCacheManagerPtr& cache_mgr, + const RdcModuleMgrPtr& module_mgr, + const RdcNotificationPtr& notif) + : group_settings_(group_settings), + cache_mgr_(cache_mgr), + rdc_module_mgr_(module_mgr), + notifications_(notif), + last_cleanup_time_(0) {} -rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, - const char job_id[64], uint64_t update_freq, - const rdc_gpu_gauges_t& gpu_gauges) { - do { //< lock guard for thread safe - std::lock_guard guard(watch_mutex_); - if (job_watch_table_.find(job_id) != job_watch_table_.end()) { - return RDC_ST_ALREADY_EXIST; - } - } while (0); - - std::vector fields_in_watch; - rdc_status_t result = get_fields_from_group(group_id, - JOB_FIELD_ID, fields_in_watch); - if (result != RDC_ST_OK) { - return result; - } - if (fields_in_watch.size() == 0) { - RDC_LOG(RDC_ERROR, "Fail to start job " << job_id <<". 
The group " - << group_id << " must contain at least one GPU."); - return RDC_ST_NOT_FOUND; +rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64], + uint64_t update_freq, + const rdc_gpu_gauges_t& gpu_gauges) { + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + if (job_watch_table_.find(job_id) != job_watch_table_.end()) { + return RDC_ST_ALREADY_EXIST; } + } while (0); - JobWatchTableEntry jentry {group_id, fields_in_watch}; - do { //< lock guard for thread safe - std::lock_guard guard(watch_mutex_); - job_watch_table_.insert({job_id, jentry}); - } while (0); - - - rdc_field_group_info_t finfo; - rdc_group_info_t ginfo; - result = group_settings_->rdc_group_gpu_get_info( - group_id, &ginfo); - if (result != RDC_ST_OK) { - return result; - } - - result = group_settings_->rdc_group_field_get_info(JOB_FIELD_ID, &finfo); - if (result != RDC_ST_OK) { - return result; - } - - result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo, gpu_gauges); - if (result != RDC_ST_OK) { - return result; - } - - // At last, when every thing sets up, starts to watch the fields. - result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0); + std::vector fields_in_watch; + rdc_status_t result = get_fields_from_group(group_id, JOB_FIELD_ID, fields_in_watch); + if (result != RDC_ST_OK) { return result; + } + if (fields_in_watch.size() == 0) { + RDC_LOG(RDC_ERROR, "Fail to start job " << job_id << ". 
The group " << group_id + << " must contain at least one GPU."); + return RDC_ST_NOT_FOUND; + } + + JobWatchTableEntry jentry{group_id, fields_in_watch}; + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + job_watch_table_.insert({job_id, jentry}); + } while (0); + + rdc_field_group_info_t finfo; + rdc_group_info_t ginfo; + result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); + if (result != RDC_ST_OK) { + return result; + } + + result = group_settings_->rdc_group_field_get_info(JOB_FIELD_ID, &finfo); + if (result != RDC_ST_OK) { + return result; + } + + result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo, gpu_gauges); + if (result != RDC_ST_OK) { + return result; + } + + // At last, when every thing sets up, starts to watch the fields. + result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0); + return result; } rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(const char job_id[64], - const rdc_gpu_gauges_t& gpu_gauge) { - uint32_t job_group_id; - do { //< lock guard for thread safe - std::lock_guard guard(watch_mutex_); - auto job = job_watch_table_.find(job_id); - if (job == job_watch_table_.end()) { - return RDC_ST_NOT_FOUND; - } - job_group_id = job->second.group_id; - } while (0); - - rdc_status_t result = rdc_field_unwatch(job_group_id, JOB_FIELD_ID); - if (result != RDC_ST_OK) { - return result; + const rdc_gpu_gauges_t& gpu_gauge) { + uint32_t job_group_id; + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + auto job = job_watch_table_.find(job_id); + if (job == job_watch_table_.end()) { + return RDC_ST_NOT_FOUND; } + job_group_id = job->second.group_id; + } while (0); - do { //< lock guard for thread safe - std::lock_guard guard(watch_mutex_); - job_watch_table_.erase(job_id); - } while (0); - - result = cache_mgr_->rdc_job_stop_stats(job_id, gpu_gauge); - + rdc_status_t result = rdc_field_unwatch(job_group_id, JOB_FIELD_ID); + if (result != RDC_ST_OK) { return 
result; + } + + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + job_watch_table_.erase(job_id); + } while (0); + + result = cache_mgr_->rdc_job_stop_stats(job_id, gpu_gauge); + + return result; } rdc_status_t RdcWatchTableImpl::rdc_job_remove(const char job_id[64]) { - rdc_gpu_gauges_t gpu_gauge; - rdc_job_stop_stats(job_id, gpu_gauge); - return cache_mgr_->rdc_job_remove(job_id); + rdc_gpu_gauges_t gpu_gauge; + rdc_job_stop_stats(job_id, gpu_gauge); + return cache_mgr_->rdc_job_remove(job_id); } rdc_status_t RdcWatchTableImpl::rdc_job_remove_all() { - // Get all the job ids; - std::vector v; - do { //< lock guard for thread safe - std::lock_guard guard(watch_mutex_); - for (auto ite = job_watch_table_.begin(); - ite != job_watch_table_.end(); ite++) { - v.push_back(ite->first); - } - } while (0); - - // Stop them - for (auto job = v.begin(); job != v.end(); job++) { - rdc_gpu_gauges_t gpu_gauge; - rdc_job_stop_stats(const_cast(job->c_str()), gpu_gauge); + // Get all the job ids; + std::vector v; + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + for (auto ite = job_watch_table_.begin(); ite != job_watch_table_.end(); ite++) { + v.push_back(ite->first); } + } while (0); - return cache_mgr_->rdc_job_remove_all(); + // Stop them + for (auto job = v.begin(); job != v.end(); job++) { + rdc_gpu_gauges_t gpu_gauge; + rdc_job_stop_stats(const_cast(job->c_str()), gpu_gauge); + } + + return cache_mgr_->rdc_job_remove_all(); } - rdc_status_t RdcWatchTableImpl::get_fields_from_group(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, std::vector & fields) { - rdc_field_group_info_t finfo; - rdc_group_info_t ginfo; - rdc_status_t result = group_settings_-> - rdc_group_gpu_get_info(group_id, &ginfo); - if (result != RDC_ST_OK) { - return result; - } + rdc_field_grp_t field_group_id, + std::vector& fields) { + rdc_field_group_info_t finfo; + rdc_group_info_t ginfo; + rdc_status_t result = 
group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); + if (result != RDC_ST_OK) { + return result; + } - result = group_settings_->rdc_group_field_get_info(field_group_id, &finfo); - if (result != RDC_ST_OK) { - return result; - } + result = group_settings_->rdc_group_field_get_info(field_group_id, &finfo); + if (result != RDC_ST_OK) { + return result; + } - for (uint32_t i = 0 ; i < ginfo.count; i++) { // GPUs - for (uint32_t j = 0; j < finfo.count; j++) { // Fields - fields.push_back(RdcFieldKey({ginfo.entity_ids[i], - finfo.field_ids[j]})); - } + for (uint32_t i = 0; i < ginfo.count; i++) { // GPUs + for (uint32_t j = 0; j < finfo.count; j++) { // Fields + fields.push_back(RdcFieldKey({ginfo.entity_ids[i], finfo.field_ids[j]})); } + } - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples) { - std::lock_guard guard(watch_mutex_); - RdcFieldGroupKey gkey({group_id, field_group_id}); - auto table_iter = watch_table_.find(gkey); + rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) { + std::lock_guard guard(watch_mutex_); + RdcFieldGroupKey gkey({group_id, field_group_id}); + auto table_iter = watch_table_.find(gkey); - // Already in the watch table - if (table_iter != watch_table_.end()) { - if (table_iter->second.is_watching) { - return RDC_ST_CONFLICT; - } else { // delete to overwrite - watch_table_.erase(table_iter); - } + // Already in the watch table + if (table_iter != watch_table_.end()) { + if (table_iter->second.is_watching) { + return RDC_ST_CONFLICT; + } else { // delete to overwrite + watch_table_.erase(table_iter); } + } - // The field settings for this watch - FieldSettings f; - f.update_freq = update_freq; - f.max_keep_age = max_keep_age; - f.max_keep_samples = max_keep_samples; - f.last_update_time = 0; - f.is_watching = 
true; + // The field settings for this watch + FieldSettings f; + f.update_freq = update_freq; + f.max_keep_age = max_keep_age; + f.max_keep_samples = max_keep_samples; + f.last_update_time = 0; + f.is_watching = true; + // Get individual fields for the watch + std::vector fields_in_watch; + rdc_status_t result = get_fields_from_group(group_id, field_group_id, fields_in_watch); + if (result != RDC_ST_OK) { + return result; + } - // Get individual fields for the watch - std::vector fields_in_watch; - rdc_status_t result = get_fields_from_group(group_id, - field_group_id, fields_in_watch); - if (result != RDC_ST_OK) { - return result; - } - - // See if any of the fields are notification fields, and - // set them up, if so. - result = notifications_->set_listen_events(fields_in_watch); - if (result != RDC_ST_OK) { - RDC_LOG(RDC_DEBUG, - "Error in configuring for event notification. Return " << result); - } - // Skip not supported fields - uint32_t unsupported_fields = 0; - auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); - if (rdc_telemetry) { - uint32_t field_ids[MAX_NUM_FIELDS]; - uint32_t field_count; - result = rdc_telemetry-> - rdc_telemetry_fields_query(field_ids, &field_count); - if (result == RDC_ST_OK) { - RDC_LOG(RDC_DEBUG, "The system support " - << field_count << " fields"); - for (auto it = fields_in_watch.begin(); - it != fields_in_watch.end(); ) { - bool not_supported = true; - for (uint32_t fi = 0; fi < field_count; fi++) { - if (field_ids[fi] == it->second) { - not_supported = false; - break; - } - } - if (not_supported) { - if (!notifications_->is_notification_event(it->second)) { - unsupported_fields++; - } - it = fields_in_watch.erase(it); - } else { - it++; - } - } // end for - } // end if - } - if (unsupported_fields > 0) { - RDC_LOG(RDC_DEBUG, "Skip watch " << unsupported_fields - <<" fields as they are not supported."); - } - - // Update the fields_to_watch_ - auto f_in_watch_iter = fields_in_watch.begin(); - - for (; 
f_in_watch_iter != fields_in_watch.end(); f_in_watch_iter++) { - auto ite = fields_to_watch_.find(*f_in_watch_iter); - if (ite == fields_to_watch_.end()) { // A new field - fields_to_watch_.insert({*f_in_watch_iter, f}); - } else { // Merge the settings - auto& f_in_table = ite->second; - f_in_table.max_keep_age = - std::max(f_in_table.max_keep_age, max_keep_age); - f_in_table.max_keep_samples = - std::max(f_in_table.max_keep_samples, max_keep_samples); - if (f_in_table.is_watching) { // Already watching - f_in_table.update_freq = - std::min(f_in_table.update_freq, update_freq); - } else { // Not watching before - f_in_table.is_watching = true; - f_in_table.update_freq = update_freq; + // See if any of the fields are notification fields, and + // set them up, if so. + result = notifications_->set_listen_events(fields_in_watch); + if (result != RDC_ST_OK) { + RDC_LOG(RDC_DEBUG, "Error in configuring for event notification. Return " << result); + } + // Skip not supported fields + uint32_t unsupported_fields = 0; + auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); + if (rdc_telemetry) { + uint32_t field_ids[MAX_NUM_FIELDS]; + uint32_t field_count; + result = rdc_telemetry->rdc_telemetry_fields_query(field_ids, &field_count); + if (result == RDC_ST_OK) { + RDC_LOG(RDC_DEBUG, "The system support " << field_count << " fields"); + for (auto it = fields_in_watch.begin(); it != fields_in_watch.end();) { + bool not_supported = true; + for (uint32_t fi = 0; fi < field_count; fi++) { + if (field_ids[fi] == it->second) { + not_supported = false; + break; } - } - } - - // Add to the watch table - watch_table_.insert({gkey, f}); - - // Notify the telemetry_module to watch those fields - if (rdc_telemetry) { - std::vector fields; - auto fields_to_watch_iter = fields_to_watch_.begin(); - for (; fields_to_watch_iter != fields_to_watch_.end(); - fields_to_watch_iter++) { - if (fields_to_watch_iter->second.is_watching) { - 
fields.push_back({fields_to_watch_iter->first.first, - fields_to_watch_iter->first.second}); - } } - rdc_telemetry->rdc_telemetry_fields_watch(&fields[0], - fields.size()); - } + if (not_supported) { + if (!notifications_->is_notification_event(it->second)) { + unsupported_fields++; + } + it = fields_in_watch.erase(it); + } else { + it++; + } + } // end for + } // end if + } + if (unsupported_fields > 0) { + RDC_LOG(RDC_DEBUG, "Skip watch " << unsupported_fields << " fields as they are not supported."); + } - return RDC_ST_OK; + // Update the fields_to_watch_ + auto f_in_watch_iter = fields_in_watch.begin(); + + for (; f_in_watch_iter != fields_in_watch.end(); f_in_watch_iter++) { + auto ite = fields_to_watch_.find(*f_in_watch_iter); + if (ite == fields_to_watch_.end()) { // A new field + fields_to_watch_.insert({*f_in_watch_iter, f}); + } else { // Merge the settings + auto& f_in_table = ite->second; + f_in_table.max_keep_age = std::max(f_in_table.max_keep_age, max_keep_age); + f_in_table.max_keep_samples = std::max(f_in_table.max_keep_samples, max_keep_samples); + if (f_in_table.is_watching) { // Already watching + f_in_table.update_freq = std::min(f_in_table.update_freq, update_freq); + } else { // Not watching before + f_in_table.is_watching = true; + f_in_table.update_freq = update_freq; + } + } + } + + // Add to the watch table + watch_table_.insert({gkey, f}); + + // Notify the telemetry_module to watch those fields + if (rdc_telemetry) { + std::vector fields; + auto fields_to_watch_iter = fields_to_watch_.begin(); + for (; fields_to_watch_iter != fields_to_watch_.end(); fields_to_watch_iter++) { + if (fields_to_watch_iter->second.is_watching) { + fields.push_back({fields_to_watch_iter->first.first, fields_to_watch_iter->first.second}); + } + } + rdc_telemetry->rdc_telemetry_fields_watch(&fields[0], fields.size()); + } + + return RDC_ST_OK; } -rdc_status_t RdcWatchTableImpl::update_field_in_table_when_unwatch( - const RdcFieldGroupKey& entry) { - // Get 
individual fields for this unwatch - std::vector fields; - rdc_status_t result = get_fields_from_group( - entry.first, entry.second, fields); +rdc_status_t RdcWatchTableImpl::update_field_in_table_when_unwatch(const RdcFieldGroupKey& entry) { + // Get individual fields for this unwatch + std::vector fields; + rdc_status_t result = get_fields_from_group(entry.first, entry.second, fields); + if (result != RDC_ST_OK) { + return result; + } + + // Unwatch will only impact the update_freq, but not the max_keep_age + // and max_keep_samples. Walk through watch_table_ to get new update + // frequency for all fields and store it in update_frequencies + std::map update_frequencies; + auto w_iter = watch_table_.begin(); + for (; w_iter != watch_table_.end(); w_iter++) { + // Skip the table is not in watching status + if (w_iter->second.is_watching == false) { + continue; + } + + // Get all fields in current table + std::vector watch_fields; + result = get_fields_from_group(w_iter->first.first, w_iter->first.second, watch_fields); if (result != RDC_ST_OK) { - return result; + return result; } - // Unwatch will only impact the update_freq, but not the max_keep_age - // and max_keep_samples. 
Walk through watch_table_ to get new update - // frequency for all fields and store it in update_frequencies - std::map update_frequencies; - auto w_iter = watch_table_.begin(); - for (; w_iter != watch_table_.end(); w_iter++) { - // Skip the table is not in watching status - if (w_iter->second.is_watching == false) { - continue; - } + // Get the update_freq + auto fields_in_table_iter = watch_fields.begin(); + for (; fields_in_table_iter != watch_fields.end(); fields_in_table_iter++) { + auto f_in_freq_iter = update_frequencies.find(*fields_in_table_iter); + if (f_in_freq_iter == update_frequencies.end()) { + update_frequencies.insert({*fields_in_table_iter, w_iter->second.update_freq}); + } else { + f_in_freq_iter->second = std::min(f_in_freq_iter->second, w_iter->second.update_freq); + } + } + } - // Get all fields in current table - std::vector watch_fields; - result = get_fields_from_group(w_iter->first.first, - w_iter->first.second, watch_fields); - if (result != RDC_ST_OK) { - return result; - } - - // Get the update_freq - auto fields_in_table_iter = watch_fields.begin(); - for (; fields_in_table_iter != watch_fields.end(); - fields_in_table_iter++) { - auto f_in_freq_iter = update_frequencies.find( - *fields_in_table_iter); - if (f_in_freq_iter == update_frequencies.end()) { - update_frequencies.insert( - {*fields_in_table_iter, w_iter->second.update_freq}); - } else { - f_in_freq_iter->second = - std::min(f_in_freq_iter->second, - w_iter->second.update_freq); - } - } + // Update the fields that impacted by this unwatch + auto fite = fields.begin(); + std::vector unwatch_fields; + for (; fite != fields.end(); fite++) { + // Turn off any notification fields + if (notifications_->is_notification_event(fite->second)) { + notifications_->stop_listening(fite->first); + continue; } - // Update the fields that impacted by this unwatch - auto fite = fields.begin(); - std::vector unwatch_fields; - for (; fite != fields.end(); fite++) { - // Turn off any 
notification fields - if (notifications_->is_notification_event(fite->second)) { - notifications_->stop_listening(fite->first); - continue; - } - - auto f_in_table = fields_to_watch_.find((*fite)); - if (f_in_table == fields_to_watch_.end()) { // Not in fields_to_watch_ - unwatch_fields.push_back({fite->first, fite->second}); - continue; - } - - auto freq_iter = update_frequencies.find(*fite); - if (freq_iter == update_frequencies.end()) { - f_in_table->second.is_watching = false; - unwatch_fields.push_back({fite->first, fite->second}); - } else { - f_in_table->second.update_freq = freq_iter->second; - } + auto f_in_table = fields_to_watch_.find((*fite)); + if (f_in_table == fields_to_watch_.end()) { // Not in fields_to_watch_ + unwatch_fields.push_back({fite->first, fite->second}); + continue; } - // Notify the telemetry_module to unwatch those fields - auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); - if (rdc_telemetry) { - rdc_telemetry->rdc_telemetry_fields_unwatch(&unwatch_fields[0], - unwatch_fields.size()); + auto freq_iter = update_frequencies.find(*fite); + if (freq_iter == update_frequencies.end()) { + f_in_table->second.is_watching = false; + unwatch_fields.push_back({fite->first, fite->second}); + } else { + f_in_table->second.update_freq = freq_iter->second; } + } - return RDC_ST_OK; + // Notify the telemetry_module to unwatch those fields + auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); + if (rdc_telemetry) { + rdc_telemetry->rdc_telemetry_fields_unwatch(&unwatch_fields[0], unwatch_fields.size()); + } + + return RDC_ST_OK; } -rdc_status_t RdcWatchTableImpl::rdc_field_unwatch( - rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) { - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; +rdc_status_t RdcWatchTableImpl::rdc_field_unwatch(rdc_gpu_group_t group_id, + rdc_field_grp_t field_group_id) { + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t now = 
static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; - std::lock_guard guard(watch_mutex_); - // Set is_watching = false - auto ite = watch_table_.find(RdcFieldGroupKey({group_id, field_group_id})); - if (ite == watch_table_.end()) { - return RDC_ST_NOT_FOUND; - } - ite->second.is_watching = false; - ite->second.last_update_time = now; + std::lock_guard guard(watch_mutex_); + // Set is_watching = false + auto ite = watch_table_.find(RdcFieldGroupKey({group_id, field_group_id})); + if (ite == watch_table_.end()) { + return RDC_ST_NOT_FOUND; + } + ite->second.is_watching = false; + ite->second.last_update_time = now; - // Update the fields_to_watch_ - return update_field_in_table_when_unwatch(ite->first); + // Update the fields_to_watch_ + return update_field_in_table_when_unwatch(ite->first); } -bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, - rdc_field_t field_id, std::string& job_id) const { - RdcFieldKey key{gpu_index, field_id}; +bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id, + std::string& job_id) const { + RdcFieldKey key{gpu_index, field_id}; - for (auto ite = job_watch_table_.begin(); - ite != job_watch_table_.end(); ite++) { - auto& fields = ite->second.fields; - if (std::find(fields.begin(), fields.end(), key) != fields.end()) { - job_id = ite->first; - return true; - } + for (auto ite = job_watch_table_.begin(); ite != job_watch_table_.end(); ite++) { + auto& fields = ite->second.fields; + if (std::find(fields.begin(), fields.end(), key) != fields.end()) { + job_id = ite->first; + return true; } + } - return false; + return false; } -rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, - uint32_t num_values, void* user_data) { - if (values == nullptr || user_data == nullptr) { - return RDC_ST_BAD_PARAMETER; +rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values, + void* user_data) { + if (values == nullptr || user_data == nullptr) { + 
return RDC_ST_BAD_PARAMETER; + } + RdcWatchTableImpl* watchTable = static_cast(user_data); + + for (uint32_t i = 0; i < num_values; i++) { + auto gpu_index = values[i].gpu_index; + auto field_id = values[i].field_value.field_id; + + // Always Update the timestamp + auto ite = watchTable->fields_to_watch_.find({gpu_index, field_id}); + if (ite != watchTable->fields_to_watch_.end()) { + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t now = static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; + ite->second.last_update_time = now; } - RdcWatchTableImpl* watchTable = static_cast(user_data); - for (uint32_t i = 0; i < num_values; i++) { - auto gpu_index = values[i].gpu_index; - auto field_id = values[i].field_value.field_id; - - // Always Update the timestamp - auto ite = watchTable->fields_to_watch_.find({gpu_index, field_id}); - if (ite != watchTable->fields_to_watch_.end()) { - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t now = static_cast(tv.tv_sec) * 1000 - + tv.tv_usec / 1000; - ite->second.last_update_time = now; - } - - // Only cache valid results - if (values[i].field_value.status != RDC_ST_OK) { - continue; - } - - // Update the cache - watchTable->cache_mgr_->rdc_update_cache(gpu_index, - values[i].field_value); - - // Update the job stats cache - std::string job_id; - if (watchTable->is_job_watch_field(gpu_index, field_id, job_id)) { - watchTable->cache_mgr_->rdc_update_job_stats(gpu_index, - job_id, values[i].field_value); - } + // Only cache valid results + if (values[i].field_value.status != RDC_ST_OK) { + continue; } - return RDC_ST_OK; + + // Update the cache + watchTable->cache_mgr_->rdc_update_cache(gpu_index, values[i].field_value); + + // Update the job stats cache + std::string job_id; + if (watchTable->is_job_watch_field(gpu_index, field_id, job_id)) { + watchTable->cache_mgr_->rdc_update_job_stats(gpu_index, job_id, values[i].field_value); + } + } + return RDC_ST_OK; } rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { - 
struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t now = static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; - // Collect all fields need to be updated for bulk fetch - std::vector fields; - std::lock_guard guard(watch_mutex_); - auto fite = fields_to_watch_.begin(); - for (; fite != fields_to_watch_.end(); fite++) { - // Is this field need to be updated? - uint64_t track_freq = fite->second.update_freq/1000; - if (!fite->second.is_watching || - fite->second.last_update_time+track_freq > now) { - continue; - } - fields.push_back({fite->first.first, fite->first.second}); + // Collect all fields need to be updated for bulk fetch + std::vector fields; + std::lock_guard guard(watch_mutex_); + auto fite = fields_to_watch_.begin(); + for (; fite != fields_to_watch_.end(); fite++) { + // Is this field need to be updated? + uint64_t track_freq = fite->second.update_freq / 1000; + if (!fite->second.is_watching || fite->second.last_update_time + track_freq > now) { + continue; } + fields.push_back({fite->first.first, fite->first.second}); + } - if (fields.size() != 0) { - auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); - if (rdc_telemetry) { - rdc_telemetry->rdc_telemetry_fields_value_get(&fields[0], - fields.size(), RdcWatchTableImpl::handle_fields, this); - } else { - RDC_LOG(RDC_ERROR, - "RdcWatchTableImpl: Fail to get the telemetry module"); - } + if (fields.size() != 0) { + auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); + if (rdc_telemetry) { + rdc_telemetry->rdc_telemetry_fields_value_get(&fields[0], fields.size(), + RdcWatchTableImpl::handle_fields, this); + } else { + RDC_LOG(RDC_ERROR, "RdcWatchTableImpl: Fail to get the telemetry module"); } + } - // Clean up is expensive, only do it once per second - if (now - last_cleanup_time_ > 1000) { - clean_up(); - last_cleanup_time_ = now; - } + // Clean up is expensive, only do it 
once per second + if (now - last_cleanup_time_ > 1000) { + clean_up(); + last_cleanup_time_ = now; + } - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t -RdcWatchTableImpl::rdc_notif_update_cache( - rdc_evnt_notification_t *events, uint32_t num_events) { +rdc_status_t RdcWatchTableImpl::rdc_notif_update_cache(rdc_evnt_notification_t* events, + uint32_t num_events) { if (events == nullptr || num_events == 0) { return RDC_ST_BAD_PARAMETER; } @@ -543,83 +511,81 @@ rdc_status_t RdcWatchTableImpl::rdc_field_listen_notif(uint32_t timeout_ms) { } void RdcWatchTableImpl::clean_up() { - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t now = static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; - // Clean the cache and the fields_to_watch_ table - auto fite = fields_to_watch_.begin(); - while (fite != fields_to_watch_.end()) { - cache_mgr_->evict_cache(fite->first.first, fite->first.second, - fite->second.max_keep_samples, fite->second.max_keep_age); - if (!fite->second.is_watching && fite->second.last_update_time + - fite->second.max_keep_age*1000 < now ) { - fite = fields_to_watch_.erase(fite); - } else { - ++fite; - } + // Clean the cache and the fields_to_watch_ table + auto fite = fields_to_watch_.begin(); + while (fite != fields_to_watch_.end()) { + cache_mgr_->evict_cache(fite->first.first, fite->first.second, fite->second.max_keep_samples, + fite->second.max_keep_age); + if (!fite->second.is_watching && + fite->second.last_update_time + fite->second.max_keep_age * 1000 < now) { + fite = fields_to_watch_.erase(fite); + } else { + ++fite; } + } - // Clean the watch table - auto wite = watch_table_.begin(); - while (wite != watch_table_.end()) { - if (!wite->second.is_watching && wite->second.last_update_time + - wite->second.max_keep_age*1000 < now ) { - wite = watch_table_.erase(wite); - } else { - ++wite; - } + // Clean the watch table + auto wite 
= watch_table_.begin(); + while (wite != watch_table_.end()) { + if (!wite->second.is_watching && + wite->second.last_update_time + wite->second.max_keep_age * 1000 < now) { + wite = watch_table_.erase(wite); + } else { + ++wite; } + } - // Debug log every 30 seconds - if (now/1000%30 == 0) { - debug_status(); - } + // Debug log every 30 seconds + if (now / 1000 % 30 == 0) { + debug_status(); + } } void RdcWatchTableImpl::debug_status() { - RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size() - << " watch_table_:" << watch_table_.size() - << " job_watch_table_:" << job_watch_table_.size() - << " cache stats:" << cache_mgr_->get_cache_stats()); + RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size() + << " watch_table_:" << watch_table_.size() + << " job_watch_table_:" << job_watch_table_.size() + << " cache stats:" << cache_mgr_->get_cache_stats()); - if (watch_table_.size() > 0) { - RDC_LOG(RDC_DEBUG, "watch table details:"); - } - for (auto wite = watch_table_.begin(); wite != watch_table_.end(); wite++) { - RDC_LOG(RDC_DEBUG, wite->first.first << "," << wite->first.second - << ": age:" << wite->second.max_keep_age << ", samples:" - << wite->second.max_keep_samples << ", is_watching:" - << wite->second.is_watching << ", last_update_time:" - << wite->second.last_update_time <<", update_freq:" - << wite->second.update_freq); - } + if (watch_table_.size() > 0) { + RDC_LOG(RDC_DEBUG, "watch table details:"); + } + for (auto wite = watch_table_.begin(); wite != watch_table_.end(); wite++) { + RDC_LOG(RDC_DEBUG, wite->first.first << "," << wite->first.second + << ": age:" << wite->second.max_keep_age + << ", samples:" << wite->second.max_keep_samples + << ", is_watching:" << wite->second.is_watching + << ", last_update_time:" << wite->second.last_update_time + << ", update_freq:" << wite->second.update_freq); + } - if (job_watch_table_.size() > 0) { - RDC_LOG(RDC_DEBUG, "job watch table details: "); - } - for (auto jite = 
job_watch_table_.begin(); - jite !=job_watch_table_.end(); jite++) { - std::stringstream strstream; - for (const auto& p : jite->second.fields) { - strstream << "<" << p.first << "," << p.second << "> "; - } - RDC_LOG(RDC_DEBUG, jite->first << ": " << jite->second.group_id - << " fields : "<< strstream.str()); + if (job_watch_table_.size() > 0) { + RDC_LOG(RDC_DEBUG, "job watch table details: "); + } + for (auto jite = job_watch_table_.begin(); jite != job_watch_table_.end(); jite++) { + std::stringstream strstream; + for (const auto& p : jite->second.fields) { + strstream << "<" << p.first << "," << p.second << "> "; } + RDC_LOG(RDC_DEBUG, + jite->first << ": " << jite->second.group_id << " fields : " << strstream.str()); + } - if (fields_to_watch_.size() > 0) { - RDC_LOG(RDC_DEBUG, "fields to watch details:"); - } - for (auto fite = fields_to_watch_.begin(); fite != fields_to_watch_.end(); - fite++) { - RDC_LOG(RDC_DEBUG, fite->first.first << "," << fite->first.second - << ": age:" << fite->second.max_keep_age << ", samples:" - << fite->second.max_keep_samples << ", is_watching:" - << fite->second.is_watching << ", last_update_time:" - << fite->second.last_update_time <<", update_freq:" - << fite->second.update_freq); - } + if (fields_to_watch_.size() > 0) { + RDC_LOG(RDC_DEBUG, "fields to watch details:"); + } + for (auto fite = fields_to_watch_.begin(); fite != fields_to_watch_.end(); fite++) { + RDC_LOG(RDC_DEBUG, fite->first.first << "," << fite->first.second + << ": age:" << fite->second.max_keep_age + << ", samples:" << fite->second.max_keep_samples + << ", is_watching:" << fite->second.is_watching + << ", last_update_time:" << fite->second.last_update_time + << ", update_freq:" << fite->second.update_freq); + } } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc/src/RsmiUtils.cc b/projects/rdc/rdc_libs/rdc/src/RsmiUtils.cc index c6dff35b47..63bb284faa 100644 --- a/projects/rdc/rdc_libs/rdc/src/RsmiUtils.cc +++ 
b/projects/rdc/rdc_libs/rdc/src/RsmiUtils.cc @@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "rocm_smi/rocm_smi.h" #include "rdc/rdc.h" +#include "rocm_smi/rocm_smi.h" namespace amd { namespace rdc { @@ -70,4 +70,3 @@ rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi) { } // namespace rdc } // namespace amd - diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 0d0c724232..99ced43026 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -20,667 +20,622 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "rdc_lib/impl/RdcStandaloneHandler.h" -#include -#include "rdc.grpc.pb.h" // NOLINT -amd::rdc::RdcHandler *make_handler(const char* ip_and_port, - const char* root_ca, const char* client_cert, const char* client_key) { - return new amd::rdc::RdcStandaloneHandler(ip_and_port, - root_ca, client_cert, client_key); +#include + +#include "rdc.grpc.pb.h" // NOLINT + +amd::rdc::RdcHandler* make_handler(const char* ip_and_port, const char* root_ca, + const char* client_cert, const char* client_key) { + return new amd::rdc::RdcStandaloneHandler(ip_and_port, root_ca, client_cert, client_key); } namespace amd { namespace rdc { -RdcStandaloneHandler::RdcStandaloneHandler(const char* ip_and_port, - const char* root_ca, const char* client_cert, const char* client_key) { - std::shared_ptr cred(nullptr); - if (root_ca == nullptr || client_cert == nullptr - || client_key == nullptr) { - cred = grpc::InsecureChannelCredentials(); - } else { - grpc::SslCredentialsOptions sslOpts{}; - sslOpts.pem_root_certs = root_ca; - sslOpts.pem_private_key = client_key; - sslOpts.pem_cert_chain = client_cert; - cred = grpc::SslCredentials(sslOpts); - } - stub_ = 
::rdc::RdcAPI::NewStub(grpc::CreateChannel(ip_and_port, cred)); - } +RdcStandaloneHandler::RdcStandaloneHandler(const char* ip_and_port, const char* root_ca, + const char* client_cert, const char* client_key) { + std::shared_ptr cred(nullptr); + if (root_ca == nullptr || client_cert == nullptr || client_key == nullptr) { + cred = grpc::InsecureChannelCredentials(); + } else { + grpc::SslCredentialsOptions sslOpts{}; + sslOpts.pem_root_certs = root_ca; + sslOpts.pem_private_key = client_key; + sslOpts.pem_cert_chain = client_cert; + cred = grpc::SslCredentials(sslOpts); + } + stub_ = ::rdc::RdcAPI::NewStub(grpc::CreateChannel(ip_and_port, cred)); +} +rdc_status_t RdcStandaloneHandler::error_handle(::grpc::Status status, uint32_t rdc_status) { + if (!status.ok()) { + std::cout << status.error_message() << ". Error code:" << status.error_code() << std::endl; + return RDC_ST_CLIENT_ERROR; + } -rdc_status_t RdcStandaloneHandler::error_handle(::grpc::Status status, - uint32_t rdc_status) { - if (!status.ok()) { - std::cout<< status.error_message() <<". 
Error code:" - << status.error_code() << std::endl; - return RDC_ST_CLIENT_ERROR; - } - - return static_cast(rdc_status); + return static_cast(rdc_status); } // JOB RdcAPI -rdc_status_t RdcStandaloneHandler::rdc_job_start_stats( - rdc_gpu_group_t groupId, const char job_id[64], uint64_t update_freq) { - ::rdc::StartJobStatsRequest request; - ::rdc::StartJobStatsResponse reply; - ::grpc::ClientContext context; +rdc_status_t RdcStandaloneHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, + const char job_id[64], + uint64_t update_freq) { + ::rdc::StartJobStatsRequest request; + ::rdc::StartJobStatsResponse reply; + ::grpc::ClientContext context; - request.set_group_id(groupId); - request.set_job_id(job_id); - request.set_update_freq(update_freq); - ::grpc::Status status = stub_->StartJobStats(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); + request.set_group_id(groupId); + request.set_job_id(job_id); + request.set_update_freq(update_freq); + ::grpc::Status status = stub_->StartJobStats(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); - return err_status; + return err_status; } -bool RdcStandaloneHandler::copy_gpu_usage_info( - const ::rdc::GpuUsageInfo& src, - rdc_gpu_usage_info_t* target) { - if (target == nullptr) { - return false; - } +bool RdcStandaloneHandler::copy_gpu_usage_info(const ::rdc::GpuUsageInfo& src, + rdc_gpu_usage_info_t* target) { + if (target == nullptr) { + return false; + } - target->gpu_id = src.gpu_id(); - target->start_time = src.start_time(); - target->end_time = src.end_time(); - target->energy_consumed = src.energy_consumed(); - target->max_gpu_memory_used = src.max_gpu_memory_used(); - target->ecc_correct = src.ecc_correct(); - target->ecc_uncorrect = src.ecc_uncorrect(); + target->gpu_id = src.gpu_id(); + target->start_time = src.start_time(); + target->end_time = src.end_time(); + target->energy_consumed = src.energy_consumed(); + 
target->max_gpu_memory_used = src.max_gpu_memory_used(); + target->ecc_correct = src.ecc_correct(); + target->ecc_uncorrect = src.ecc_uncorrect(); - const ::rdc::JobStatsSummary& pstats = src.power_usage(); - target->power_usage.max_value = pstats.max_value(); - target->power_usage.min_value = pstats.min_value(); - target->power_usage.average = pstats.average(); - target->power_usage.standard_deviation = pstats.standard_deviation(); + const ::rdc::JobStatsSummary& pstats = src.power_usage(); + target->power_usage.max_value = pstats.max_value(); + target->power_usage.min_value = pstats.min_value(); + target->power_usage.average = pstats.average(); + target->power_usage.standard_deviation = pstats.standard_deviation(); - const ::rdc::JobStatsSummary& cstats = src.gpu_clock(); - target->gpu_clock.max_value = cstats.max_value(); - target->gpu_clock.min_value = cstats.min_value(); - target->gpu_clock.average = cstats.average(); - target->gpu_clock.standard_deviation = cstats.standard_deviation(); + const ::rdc::JobStatsSummary& cstats = src.gpu_clock(); + target->gpu_clock.max_value = cstats.max_value(); + target->gpu_clock.min_value = cstats.min_value(); + target->gpu_clock.average = cstats.average(); + target->gpu_clock.standard_deviation = cstats.standard_deviation(); - const ::rdc::JobStatsSummary& ustats = src.gpu_utilization(); - target->gpu_utilization.max_value = ustats.max_value(); - target->gpu_utilization.min_value = ustats.min_value(); - target->gpu_utilization.average = ustats.average(); - target->gpu_utilization.standard_deviation = ustats.standard_deviation(); + const ::rdc::JobStatsSummary& ustats = src.gpu_utilization(); + target->gpu_utilization.max_value = ustats.max_value(); + target->gpu_utilization.min_value = ustats.min_value(); + target->gpu_utilization.average = ustats.average(); + target->gpu_utilization.standard_deviation = ustats.standard_deviation(); - const ::rdc::JobStatsSummary& mstats = src.memory_utilization(); - 
target->memory_utilization.max_value = mstats.max_value(); - target->memory_utilization.min_value = mstats.min_value(); - target->memory_utilization.average = mstats.average(); - target->memory_utilization.standard_deviation = mstats.standard_deviation(); + const ::rdc::JobStatsSummary& mstats = src.memory_utilization(); + target->memory_utilization.max_value = mstats.max_value(); + target->memory_utilization.min_value = mstats.min_value(); + target->memory_utilization.average = mstats.average(); + target->memory_utilization.standard_deviation = mstats.standard_deviation(); - const ::rdc::JobStatsSummary& txstats = src.pcie_tx(); - target->pcie_tx.max_value = txstats.max_value(); - target->pcie_tx.min_value = txstats.min_value(); - target->pcie_tx.average = txstats.average(); - target->pcie_tx.standard_deviation = txstats.standard_deviation(); + const ::rdc::JobStatsSummary& txstats = src.pcie_tx(); + target->pcie_tx.max_value = txstats.max_value(); + target->pcie_tx.min_value = txstats.min_value(); + target->pcie_tx.average = txstats.average(); + target->pcie_tx.standard_deviation = txstats.standard_deviation(); - const ::rdc::JobStatsSummary& rxstats = src.pcie_rx(); - target->pcie_rx.max_value = rxstats.max_value(); - target->pcie_rx.min_value = rxstats.min_value(); - target->pcie_rx.average = rxstats.average(); - target->pcie_rx.standard_deviation = rxstats.standard_deviation(); + const ::rdc::JobStatsSummary& rxstats = src.pcie_rx(); + target->pcie_rx.max_value = rxstats.max_value(); + target->pcie_rx.min_value = rxstats.min_value(); + target->pcie_rx.average = rxstats.average(); + target->pcie_rx.standard_deviation = rxstats.standard_deviation(); - const ::rdc::JobStatsSummary& mcstats = src.memory_clock(); - target->memory_clock.max_value = mcstats.max_value(); - target->memory_clock.min_value = mcstats.min_value(); - target->memory_clock.average = mcstats.average(); - target->memory_clock.standard_deviation = mcstats.standard_deviation(); + const 
::rdc::JobStatsSummary& mcstats = src.memory_clock(); + target->memory_clock.max_value = mcstats.max_value(); + target->memory_clock.min_value = mcstats.min_value(); + target->memory_clock.average = mcstats.average(); + target->memory_clock.standard_deviation = mcstats.standard_deviation(); - const ::rdc::JobStatsSummary& gtstats = src.gpu_temperature(); - target->gpu_temperature.max_value = gtstats.max_value(); - target->gpu_temperature.min_value = gtstats.min_value(); - target->gpu_temperature.average = gtstats.average(); - target->gpu_temperature.standard_deviation = gtstats.standard_deviation(); + const ::rdc::JobStatsSummary& gtstats = src.gpu_temperature(); + target->gpu_temperature.max_value = gtstats.max_value(); + target->gpu_temperature.min_value = gtstats.min_value(); + target->gpu_temperature.average = gtstats.average(); + target->gpu_temperature.standard_deviation = gtstats.standard_deviation(); - return true; + return true; } rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(const char job_id[64], - rdc_job_info_t* p_job_info) { - if (!p_job_info) { - return RDC_ST_BAD_PARAMETER; - } + rdc_job_info_t* p_job_info) { + if (!p_job_info) { + return RDC_ST_BAD_PARAMETER; + } - ::rdc::GetJobStatsRequest request; - ::rdc::GetJobStatsResponse reply; - ::grpc::ClientContext context; + ::rdc::GetJobStatsRequest request; + ::rdc::GetJobStatsResponse reply; + ::grpc::ClientContext context; - request.set_job_id(job_id); - ::grpc::Status status = stub_->GetJobStats(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + request.set_job_id(job_id); + ::grpc::Status status = stub_->GetJobStats(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - p_job_info->num_gpus = reply.num_gpus(); - copy_gpu_usage_info(reply.summary(), &(p_job_info->summary)); - for (int i = 0; i < 
reply.gpus_size(); i++) { - copy_gpu_usage_info(reply.gpus(i), &(p_job_info->gpus[i])); - } + p_job_info->num_gpus = reply.num_gpus(); + copy_gpu_usage_info(reply.summary(), &(p_job_info->summary)); + for (int i = 0; i < reply.gpus_size(); i++) { + copy_gpu_usage_info(reply.gpus(i), &(p_job_info->gpus[i])); + } - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcStandaloneHandler::rdc_job_stop_stats(const char job_id[64]) { - ::rdc::StopJobStatsRequest request; - ::rdc::StopJobStatsResponse reply; - ::grpc::ClientContext context; + ::rdc::StopJobStatsRequest request; + ::rdc::StopJobStatsResponse reply; + ::grpc::ClientContext context; - request.set_job_id(job_id); - ::grpc::Status status = stub_->StopJobStats(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); + request.set_job_id(job_id); + ::grpc::Status status = stub_->StopJobStats(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); - return err_status; + return err_status; } rdc_status_t RdcStandaloneHandler::rdc_job_remove(const char job_id[64]) { - ::rdc::RemoveJobRequest request; - ::rdc::RemoveJobResponse reply; - ::grpc::ClientContext context; + ::rdc::RemoveJobRequest request; + ::rdc::RemoveJobResponse reply; + ::grpc::ClientContext context; - request.set_job_id(job_id); - ::grpc::Status status = stub_->RemoveJob(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); + request.set_job_id(job_id); + ::grpc::Status status = stub_->RemoveJob(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); - return err_status; + return err_status; } rdc_status_t RdcStandaloneHandler::rdc_job_remove_all() { - ::rdc::Empty request; - ::rdc::RemoveAllJobResponse reply; - ::grpc::ClientContext context; + ::rdc::Empty request; + ::rdc::RemoveAllJobResponse reply; + ::grpc::ClientContext context; - ::grpc::Status status = stub_->RemoveAllJob(&context, 
request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); + ::grpc::Status status = stub_->RemoveAllJob(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); - return err_status; + return err_status; } // Discovery RdcAPI -rdc_status_t RdcStandaloneHandler::rdc_device_get_all( - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { - if (!count) { - return RDC_ST_BAD_PARAMETER; - } - ::rdc::Empty request; - ::rdc::GetAllDevicesResponse reply; - ::grpc::ClientContext context; +rdc_status_t RdcStandaloneHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], + uint32_t* count) { + if (!count) { + return RDC_ST_BAD_PARAMETER; + } + ::rdc::Empty request; + ::rdc::GetAllDevicesResponse reply; + ::grpc::ClientContext context; - ::grpc::Status status = stub_->GetAllDevices(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + ::grpc::Status status = stub_->GetAllDevices(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - if (reply.gpus_size() > RDC_MAX_NUM_DEVICES) { - return RDC_ST_BAD_PARAMETER; - } + if (reply.gpus_size() > RDC_MAX_NUM_DEVICES) { + return RDC_ST_BAD_PARAMETER; + } - *count = reply.gpus_size(); - for (uint32_t i =0 ; i < *count; i++) { - gpu_index_list[i] = reply.gpus(i); - } + *count = reply.gpus_size(); + for (uint32_t i = 0; i < *count; i++) { + gpu_index_list[i] = reply.gpus(i); + } - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcStandaloneHandler::rdc_device_get_attributes(uint32_t gpu_index, - rdc_device_attributes_t* p_rdc_attr) { - if (!p_rdc_attr) { - return RDC_ST_BAD_PARAMETER; - } - ::rdc::GetDeviceAttributesRequest request; - ::rdc::GetDeviceAttributesResponse reply; - ::grpc::ClientContext context; + rdc_device_attributes_t* p_rdc_attr) { + if 
(!p_rdc_attr) { + return RDC_ST_BAD_PARAMETER; + } + ::rdc::GetDeviceAttributesRequest request; + ::rdc::GetDeviceAttributesResponse reply; + ::grpc::ClientContext context; - request.set_gpu_index(gpu_index); - ::grpc::Status status = stub_-> - GetDeviceAttributes(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + request.set_gpu_index(gpu_index); + ::grpc::Status status = stub_->GetDeviceAttributes(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - strncpy_with_null(p_rdc_attr->device_name, - reply.attributes().device_name().c_str(), RDC_MAX_STR_LENGTH); + strncpy_with_null(p_rdc_attr->device_name, reply.attributes().device_name().c_str(), + RDC_MAX_STR_LENGTH); - - return RDC_ST_OK; + return RDC_ST_OK; } - // Group RdcAPI rdc_status_t RdcStandaloneHandler::rdc_group_gpu_create(rdc_group_type_t type, - const char* group_name, - rdc_gpu_group_t* p_rdc_group_id) { - if (!group_name || !p_rdc_group_id) { - return RDC_ST_BAD_PARAMETER; - } + const char* group_name, + rdc_gpu_group_t* p_rdc_group_id) { + if (!group_name || !p_rdc_group_id) { + return RDC_ST_BAD_PARAMETER; + } - ::rdc::CreateGpuGroupRequest request; - ::rdc::CreateGpuGroupResponse reply; - ::grpc::ClientContext context; + ::rdc::CreateGpuGroupRequest request; + ::rdc::CreateGpuGroupResponse reply; + ::grpc::ClientContext context; - request.set_type( - static_cast<::rdc::CreateGpuGroupRequest_GpuGroupType>(type)); - request.set_group_name(group_name); - ::grpc::Status status = stub_-> - CreateGpuGroup(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + request.set_type(static_cast<::rdc::CreateGpuGroupRequest_GpuGroupType>(type)); + request.set_group_name(group_name); + ::grpc::Status status = stub_->CreateGpuGroup(&context, 
request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - *p_rdc_group_id = reply.group_id(); + *p_rdc_group_id = reply.group_id(); - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcStandaloneHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, - uint32_t gpu_index) { - ::rdc::AddToGpuGroupRequest request; - ::rdc::AddToGpuGroupResponse reply; - ::grpc::ClientContext context; +rdc_status_t RdcStandaloneHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, uint32_t gpu_index) { + ::rdc::AddToGpuGroupRequest request; + ::rdc::AddToGpuGroupResponse reply; + ::grpc::ClientContext context; - request.set_group_id(group_id); - request.set_gpu_index(gpu_index); - ::grpc::Status status = stub_-> - AddToGpuGroup(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); + request.set_group_id(group_id); + request.set_gpu_index(gpu_index); + ::grpc::Status status = stub_->AddToGpuGroup(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); - return err_status; + return err_status; } -rdc_status_t RdcStandaloneHandler::rdc_group_field_create( - uint32_t num_field_ids, rdc_field_t* field_ids, - const char* field_group_name, rdc_field_grp_t* rdc_field_group_id) { - if (!field_ids || !field_group_name || !rdc_field_group_id) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcStandaloneHandler::rdc_group_field_create(uint32_t num_field_ids, + rdc_field_t* field_ids, + const char* field_group_name, + rdc_field_grp_t* rdc_field_group_id) { + if (!field_ids || !field_group_name || !rdc_field_group_id) { + return RDC_ST_BAD_PARAMETER; + } - ::rdc::CreateFieldGroupRequest request; - ::rdc::CreateFieldGroupResponse reply; - ::grpc::ClientContext context; + ::rdc::CreateFieldGroupRequest request; + ::rdc::CreateFieldGroupResponse reply; + ::grpc::ClientContext context; - request.set_field_group_name(field_group_name); - for 
(uint32_t i = 0; i < num_field_ids; i++){ - request.add_field_ids(field_ids[i]); - } + request.set_field_group_name(field_group_name); + for (uint32_t i = 0; i < num_field_ids; i++) { + request.add_field_ids(field_ids[i]); + } - ::grpc::Status status = stub_-> - CreateFieldGroup(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; - *rdc_field_group_id = reply.field_group_id(); + ::grpc::Status status = stub_->CreateFieldGroup(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + *rdc_field_group_id = reply.field_group_id(); - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcStandaloneHandler::rdc_group_field_get_info( - rdc_field_grp_t rdc_field_group_id, - rdc_field_group_info_t* field_group_info) { - if (!field_group_info) { - return RDC_ST_BAD_PARAMETER; - } + rdc_field_grp_t rdc_field_group_id, rdc_field_group_info_t* field_group_info) { + if (!field_group_info) { + return RDC_ST_BAD_PARAMETER; + } - ::rdc::GetFieldGroupInfoRequest request; - ::rdc::GetFieldGroupInfoResponse reply; - ::grpc::ClientContext context; + ::rdc::GetFieldGroupInfoRequest request; + ::rdc::GetFieldGroupInfoResponse reply; + ::grpc::ClientContext context; - request.set_field_group_id(rdc_field_group_id); - ::grpc::Status status = stub_-> - GetFieldGroupInfo(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + request.set_field_group_id(rdc_field_group_id); + ::grpc::Status status = stub_->GetFieldGroupInfo(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - if (reply.field_ids_size() > RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) { - return RDC_ST_MAX_LIMIT; - } + if (reply.field_ids_size() > 
RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) { + return RDC_ST_MAX_LIMIT; + } - field_group_info->count = reply.field_ids_size(); - strncpy_with_null(field_group_info->group_name, - reply.filed_group_name().c_str(), RDC_MAX_STR_LENGTH); - for (int i = 0; i < reply.field_ids_size(); i++) { - field_group_info->field_ids[i] = - static_cast(reply.field_ids(i)); - } + field_group_info->count = reply.field_ids_size(); + strncpy_with_null(field_group_info->group_name, reply.filed_group_name().c_str(), + RDC_MAX_STR_LENGTH); + for (int i = 0; i < reply.field_ids_size(); i++) { + field_group_info->field_ids[i] = static_cast(reply.field_ids(i)); + } - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcStandaloneHandler::rdc_group_gpu_get_info( - rdc_gpu_group_t p_rdc_group_id, - rdc_group_info_t* p_rdc_group_info) { - if (!p_rdc_group_info) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcStandaloneHandler::rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id, + rdc_group_info_t* p_rdc_group_info) { + if (!p_rdc_group_info) { + return RDC_ST_BAD_PARAMETER; + } - ::rdc::GetGpuGroupInfoRequest request; - ::rdc::GetGpuGroupInfoResponse reply; - ::grpc::ClientContext context; + ::rdc::GetGpuGroupInfoRequest request; + ::rdc::GetGpuGroupInfoResponse reply; + ::grpc::ClientContext context; - request.set_group_id(p_rdc_group_id); - ::grpc::Status status = stub_-> - GetGpuGroupInfo(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + request.set_group_id(p_rdc_group_id); + ::grpc::Status status = stub_->GetGpuGroupInfo(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - if (reply.entity_ids_size() > RDC_GROUP_MAX_ENTITIES) { - return RDC_ST_MAX_LIMIT; - } + if (reply.entity_ids_size() > RDC_GROUP_MAX_ENTITIES) { + return RDC_ST_MAX_LIMIT; + } - p_rdc_group_info->count = reply.entity_ids_size(); - 
strncpy_with_null(p_rdc_group_info->group_name, - reply.group_name().c_str(), RDC_MAX_STR_LENGTH); - for (int i = 0; i < reply.entity_ids_size(); i++) { - p_rdc_group_info->entity_ids[i] = reply.entity_ids(i); - } + p_rdc_group_info->count = reply.entity_ids_size(); + strncpy_with_null(p_rdc_group_info->group_name, reply.group_name().c_str(), RDC_MAX_STR_LENGTH); + for (int i = 0; i < reply.entity_ids_size(); i++) { + p_rdc_group_info->entity_ids[i] = reply.entity_ids(i); + } - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcStandaloneHandler::rdc_group_get_all_ids( - rdc_gpu_group_t group_id_list[], uint32_t* count) { - if (!count) { - return RDC_ST_BAD_PARAMETER; - } - ::rdc::Empty request; - ::rdc::GetGroupAllIdsResponse reply; - ::grpc::ClientContext context; +rdc_status_t RdcStandaloneHandler::rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], + uint32_t* count) { + if (!count) { + return RDC_ST_BAD_PARAMETER; + } + ::rdc::Empty request; + ::rdc::GetGroupAllIdsResponse reply; + ::grpc::ClientContext context; - ::grpc::Status status = stub_-> - GetGroupAllIds(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + ::grpc::Status status = stub_->GetGroupAllIds(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - *count = reply.group_ids_size(); - if (*count >= RDC_MAX_NUM_GROUPS) { - return RDC_ST_MAX_LIMIT; - } - for (uint32_t i =0 ; i < *count; i++) { - group_id_list[i] = reply.group_ids(i); - } + *count = reply.group_ids_size(); + if (*count >= RDC_MAX_NUM_GROUPS) { + return RDC_ST_MAX_LIMIT; + } + for (uint32_t i = 0; i < *count; i++) { + group_id_list[i] = reply.group_ids(i); + } - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcStandaloneHandler::rdc_group_field_get_all_ids( rdc_field_grp_t field_group_id_list[], uint32_t* count) { - if (!count) { - return 
RDC_ST_BAD_PARAMETER; - } + if (!count) { + return RDC_ST_BAD_PARAMETER; + } - ::rdc::Empty request; - ::rdc::GetFieldGroupAllIdsResponse reply; - ::grpc::ClientContext context; + ::rdc::Empty request; + ::rdc::GetFieldGroupAllIdsResponse reply; + ::grpc::ClientContext context; - ::grpc::Status status = stub_-> - GetFieldGroupAllIds(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + ::grpc::Status status = stub_->GetFieldGroupAllIds(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - *count = reply.field_group_ids_size(); - if (*count >= RDC_MAX_NUM_FIELD_GROUPS) { - return RDC_ST_MAX_LIMIT; - } - for (uint32_t i =0 ; i < *count; i++) { - field_group_id_list[i] = reply.field_group_ids(i); - } + *count = reply.field_group_ids_size(); + if (*count >= RDC_MAX_NUM_FIELD_GROUPS) { + return RDC_ST_MAX_LIMIT; + } + for (uint32_t i = 0; i < *count; i++) { + field_group_id_list[i] = reply.field_group_ids(i); + } - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t RdcStandaloneHandler::rdc_group_gpu_destroy( - rdc_gpu_group_t p_rdc_group_id) { - ::rdc::DestroyGpuGroupRequest request; - ::rdc::DestroyGpuGroupResponse reply; - ::grpc::ClientContext context; +rdc_status_t RdcStandaloneHandler::rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) { + ::rdc::DestroyGpuGroupRequest request; + ::rdc::DestroyGpuGroupResponse reply; + ::grpc::ClientContext context; - request.set_group_id(p_rdc_group_id); - ::grpc::Status status = stub_-> - DestroyGpuGroup(&context, request, &reply); - return error_handle(status, reply.status()); + request.set_group_id(p_rdc_group_id); + ::grpc::Status status = stub_->DestroyGpuGroup(&context, request, &reply); + return error_handle(status, reply.status()); } -rdc_status_t RdcStandaloneHandler::rdc_group_field_destroy( - rdc_field_grp_t rdc_field_group_id) { - 
::rdc::DestroyFieldGroupRequest request; - ::rdc::DestroyFieldGroupResponse reply; - ::grpc::ClientContext context; +rdc_status_t RdcStandaloneHandler::rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) { + ::rdc::DestroyFieldGroupRequest request; + ::rdc::DestroyFieldGroupResponse reply; + ::grpc::ClientContext context; - request.set_field_group_id(rdc_field_group_id); - ::grpc::Status status = stub_-> - DestroyFieldGroup(&context, request, &reply); - return error_handle(status, reply.status()); + request.set_field_group_id(rdc_field_group_id); + ::grpc::Status status = stub_->DestroyFieldGroup(&context, request, &reply); + return error_handle(status, reply.status()); } // Field RdcAPI rdc_status_t RdcStandaloneHandler::rdc_field_watch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples) { - ::rdc::WatchFieldsRequest request; - ::rdc::WatchFieldsResponse reply; - ::grpc::ClientContext context; + rdc_field_grp_t field_group_id, + uint64_t update_freq, double max_keep_age, + uint32_t max_keep_samples) { + ::rdc::WatchFieldsRequest request; + ::rdc::WatchFieldsResponse reply; + ::grpc::ClientContext context; - request.set_group_id(group_id); - request.set_field_group_id(field_group_id); - request.set_update_freq(update_freq); - request.set_max_keep_age(max_keep_age); - request.set_max_keep_samples(max_keep_samples); - ::grpc::Status status = stub_-> - WatchFields(&context, request, &reply); + request.set_group_id(group_id); + request.set_field_group_id(field_group_id); + request.set_update_freq(update_freq); + request.set_max_keep_age(max_keep_age); + request.set_max_keep_samples(max_keep_samples); + ::grpc::Status status = stub_->WatchFields(&context, request, &reply); - return error_handle(status, reply.status()); + return error_handle(status, reply.status()); } -rdc_status_t RdcStandaloneHandler::rdc_field_get_latest_value( - uint32_t gpu_index, rdc_field_t field, 
rdc_field_value* value) { - if (!value) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcStandaloneHandler::rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field, + rdc_field_value* value) { + if (!value) { + return RDC_ST_BAD_PARAMETER; + } - ::rdc::GetLatestFieldValueRequest request; - ::rdc::GetLatestFieldValueResponse reply; - ::grpc::ClientContext context; + ::rdc::GetLatestFieldValueRequest request; + ::rdc::GetLatestFieldValueResponse reply; + ::grpc::ClientContext context; - request.set_gpu_index(gpu_index); - request.set_field_id(field); - ::grpc::Status status = stub_-> - GetLatestFieldValue(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + request.set_gpu_index(gpu_index); + request.set_field_id(field); + ::grpc::Status status = stub_->GetLatestFieldValue(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - value->field_id = static_cast(reply.field_id()); - value->status = reply.rdc_status(); - value->ts = reply.ts(); - value->type = static_cast(reply.type()); - if (value->type == INTEGER) { - value->value.l_int = reply.l_int(); - } else if (value->type == DOUBLE) { - value->value.dbl = reply.dbl(); - } else if (value->type == STRING || value->type == BLOB) { - strncpy_with_null(value->value.str, - reply.str().c_str(), RDC_MAX_STR_LENGTH); - } + value->field_id = static_cast(reply.field_id()); + value->status = reply.rdc_status(); + value->ts = reply.ts(); + value->type = static_cast(reply.type()); + if (value->type == INTEGER) { + value->value.l_int = reply.l_int(); + } else if (value->type == DOUBLE) { + value->value.dbl = reply.dbl(); + } else if (value->type == STRING || value->type == BLOB) { + strncpy_with_null(value->value.str, reply.str().c_str(), RDC_MAX_STR_LENGTH); + } - return RDC_ST_OK; + return RDC_ST_OK; } -rdc_status_t 
RdcStandaloneHandler::rdc_field_get_value_since(uint32_t gpu_index, - rdc_field_t field, uint64_t since_time_stamp, - uint64_t *next_since_time_stamp, rdc_field_value* value) { - if (!next_since_time_stamp || !value) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t RdcStandaloneHandler::rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field, + uint64_t since_time_stamp, + uint64_t* next_since_time_stamp, + rdc_field_value* value) { + if (!next_since_time_stamp || !value) { + return RDC_ST_BAD_PARAMETER; + } - ::rdc::GetFieldSinceRequest request; - ::rdc::GetFieldSinceResponse reply; - ::grpc::ClientContext context; + ::rdc::GetFieldSinceRequest request; + ::rdc::GetFieldSinceResponse reply; + ::grpc::ClientContext context; - request.set_gpu_index(gpu_index); - request.set_field_id(field); - request.set_since_time_stamp(since_time_stamp); - ::grpc::Status status = stub_-> - GetFieldSince(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; + request.set_gpu_index(gpu_index); + request.set_field_id(field); + request.set_since_time_stamp(since_time_stamp); + ::grpc::Status status = stub_->GetFieldSince(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; - value->field_id = static_cast(reply.field_id()); - value->status = reply.rdc_status(); - value->ts = reply.ts(); - value->type = static_cast(reply.type()); - if (value->type == INTEGER) { - value->value.l_int = reply.l_int(); - } else if (value->type == DOUBLE) { - value->value.dbl = reply.dbl(); - } else if (value->type == STRING || value->type == BLOB) { - strncpy_with_null(value->value.str, - reply.str().c_str(), RDC_MAX_STR_LENGTH); - } - *next_since_time_stamp = reply.next_since_time_stamp(); + value->field_id = static_cast(reply.field_id()); + value->status = reply.rdc_status(); + value->ts = reply.ts(); + value->type = 
static_cast(reply.type()); + if (value->type == INTEGER) { + value->value.l_int = reply.l_int(); + } else if (value->type == DOUBLE) { + value->value.dbl = reply.dbl(); + } else if (value->type == STRING || value->type == BLOB) { + strncpy_with_null(value->value.str, reply.str().c_str(), RDC_MAX_STR_LENGTH); + } + *next_since_time_stamp = reply.next_since_time_stamp(); - return RDC_ST_OK; + return RDC_ST_OK; } rdc_status_t RdcStandaloneHandler::rdc_field_unwatch(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id) { - ::rdc::UnWatchFieldsRequest request; - ::rdc::UnWatchFieldsResponse reply; - ::grpc::ClientContext context; + rdc_field_grp_t field_group_id) { + ::rdc::UnWatchFieldsRequest request; + ::rdc::UnWatchFieldsResponse reply; + ::grpc::ClientContext context; - request.set_group_id(group_id); - request.set_field_group_id(field_group_id); - ::grpc::Status status = stub_-> - UnWatchFields(&context, request, &reply); + request.set_group_id(group_id); + request.set_field_group_id(field_group_id); + ::grpc::Status status = stub_->UnWatchFields(&context, request, &reply); - return error_handle(status, reply.status()); + return error_handle(status, reply.status()); } // Diagnostic API -rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run( - rdc_gpu_group_t group_id, - rdc_diag_level_t level, - rdc_diag_response_t* response) { - if (!response) { - return RDC_ST_BAD_PARAMETER; +rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + if (!response) { + return RDC_ST_BAD_PARAMETER; + } + ::rdc::DiagnosticRunRequest request; + ::rdc::DiagnosticRunResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + request.set_level(level); + + ::grpc::Status status = stub_->DiagnosticRun(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + auto res = 
reply.response(); + response->results_count = res.results_count(); + + if (res.diag_info_size() > static_cast(MAX_TEST_CASES)) { + return RDC_ST_BAD_PARAMETER; + } + for (int i = 0; i < res.diag_info_size(); i++) { + const ::rdc::DiagnosticTestResult& result = res.diag_info(i); + rdc_diag_test_result_t& to_result = response->diag_info[i]; + to_result.status = static_cast(result.status()); + + // Set details + to_result.details.code = result.details().code(); + strncpy_with_null(to_result.details.msg, result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH); + + to_result.test_case = static_cast(result.test_case()); + to_result.per_gpu_result_count = result.per_gpu_result_count(); + + // Set Result details + if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) { + return RDC_ST_BAD_PARAMETER; } - ::rdc::DiagnosticRunRequest request; - ::rdc::DiagnosticRunResponse reply; - ::grpc::ClientContext context; - - request.set_group_id(group_id); - request.set_level(level); - - ::grpc::Status status = stub_-> - DiagnosticRun(&context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) return err_status; - auto res = reply.response(); - response->results_count = res.results_count(); - - if (res.diag_info_size() > static_cast(MAX_TEST_CASES)) { - return RDC_ST_BAD_PARAMETER; - } - for (int i = 0; i < res.diag_info_size(); i++) { - const ::rdc::DiagnosticTestResult& result = res.diag_info(i); - rdc_diag_test_result_t& to_result = response->diag_info[i]; - to_result.status = static_cast(result.status()); - - // Set details - to_result.details.code = result.details().code(); - strncpy_with_null(to_result.details.msg, - result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH); - - to_result.test_case = static_cast( - result.test_case()); - to_result.per_gpu_result_count = result.per_gpu_result_count(); - - // Set Result details - if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) { - return RDC_ST_BAD_PARAMETER; - } - for 
(int j=0; j < result.gpu_results_size(); j++) { - auto per_gpu_result = result.gpu_results(j); - rdc_diag_per_gpu_result_t& to_per_gpu = to_result.gpu_results[j]; - to_per_gpu.gpu_index = per_gpu_result.gpu_index(); - to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code(); - strncpy_with_null(to_per_gpu.gpu_result.msg, - per_gpu_result.gpu_result().msg().c_str(), MAX_DIAG_MSG_LENGTH); - } - strncpy_with_null(to_result.info, - result.info().c_str(), MAX_DIAG_MSG_LENGTH); - } - - return RDC_ST_OK; -} - -rdc_status_t RdcStandaloneHandler::rdc_test_case_run( - rdc_gpu_group_t group_id, - rdc_diag_test_cases_t test_case, - rdc_diag_test_result_t *to_result) { - if (!to_result) { - return RDC_ST_BAD_PARAMETER; - } - ::rdc::DiagnosticTestCaseRunRequest request; - ::rdc::DiagnosticTestCaseRunResponse reply; - ::grpc::ClientContext context; - - request.set_group_id(group_id); - request.set_test_case(static_cast< - ::rdc::DiagnosticTestCaseRunRequest_TestCaseType>(test_case)); - - ::grpc::Status status = stub_->DiagnosticTestCaseRun( - &context, request, &reply); - rdc_status_t err_status = error_handle(status, reply.status()); - if (err_status != RDC_ST_OK) - return err_status; - auto result = reply.result(); - - to_result->status = static_cast(result.status()); - - // Set details - to_result->details.code = result.details().code(); - strncpy_with_null(to_result->details.msg, - result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH); - - to_result->test_case = static_cast( - result.test_case()); - to_result->per_gpu_result_count = result.per_gpu_result_count(); - - // Set Result details - if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) { - return RDC_ST_BAD_PARAMETER; - } - for (int j = 0; j < result.gpu_results_size(); j++) { - auto per_gpu_result = result.gpu_results(j); - rdc_diag_per_gpu_result_t &to_per_gpu = - to_result->gpu_results[j]; - to_per_gpu.gpu_index = per_gpu_result.gpu_index(); - to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code(); - 
strncpy_with_null(to_per_gpu.gpu_result.msg, - per_gpu_result.gpu_result().msg().c_str(), + for (int j = 0; j < result.gpu_results_size(); j++) { + auto per_gpu_result = result.gpu_results(j); + rdc_diag_per_gpu_result_t& to_per_gpu = to_result.gpu_results[j]; + to_per_gpu.gpu_index = per_gpu_result.gpu_index(); + to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code(); + strncpy_with_null(to_per_gpu.gpu_result.msg, per_gpu_result.gpu_result().msg().c_str(), MAX_DIAG_MSG_LENGTH); - } - strncpy_with_null(to_result->info, - result.info().c_str(), MAX_DIAG_MSG_LENGTH); + } + strncpy_with_null(to_result.info, result.info().c_str(), MAX_DIAG_MSG_LENGTH); + } - return RDC_ST_OK; + return RDC_ST_OK; } +rdc_status_t RdcStandaloneHandler::rdc_test_case_run(rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* to_result) { + if (!to_result) { + return RDC_ST_BAD_PARAMETER; + } + ::rdc::DiagnosticTestCaseRunRequest request; + ::rdc::DiagnosticTestCaseRunResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + request.set_test_case(static_cast<::rdc::DiagnosticTestCaseRunRequest_TestCaseType>(test_case)); + + ::grpc::Status status = stub_->DiagnosticTestCaseRun(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + auto result = reply.result(); + + to_result->status = static_cast(result.status()); + + // Set details + to_result->details.code = result.details().code(); + strncpy_with_null(to_result->details.msg, result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH); + + to_result->test_case = static_cast(result.test_case()); + to_result->per_gpu_result_count = result.per_gpu_result_count(); + + // Set Result details + if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) { + return RDC_ST_BAD_PARAMETER; + } + for (int j = 0; j < result.gpu_results_size(); j++) { + auto per_gpu_result = result.gpu_results(j); + 
rdc_diag_per_gpu_result_t& to_per_gpu = to_result->gpu_results[j]; + to_per_gpu.gpu_index = per_gpu_result.gpu_index(); + to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code(); + strncpy_with_null(to_per_gpu.gpu_result.msg, per_gpu_result.gpu_result().msg().c_str(), + MAX_DIAG_MSG_LENGTH); + } + strncpy_with_null(to_result->info, result.info().c_str(), MAX_DIAG_MSG_LENGTH); + + return RDC_ST_OK; +} // Control RdcAPI -rdc_status_t RdcStandaloneHandler::rdc_field_update_all( - uint32_t wait_for_update) { - ::rdc::UpdateAllFieldsRequest request; - ::rdc::UpdateAllFieldsResponse reply; - ::grpc::ClientContext context; +rdc_status_t RdcStandaloneHandler::rdc_field_update_all(uint32_t wait_for_update) { + ::rdc::UpdateAllFieldsRequest request; + ::rdc::UpdateAllFieldsResponse reply; + ::grpc::ClientContext context; - request.set_wait_for_update(wait_for_update); - ::grpc::Status status = stub_-> - UpdateAllFields(&context, request, &reply); + request.set_wait_for_update(wait_for_update); + ::grpc::Status status = stub_->UpdateAllFields(&context, request, &reply); - return error_handle(status, reply.status()); + return error_handle(status, reply.status()); } - - } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index 0704248006..4cdee36d81 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -20,164 +20,152 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "rdc_modules/rdc_rocp/RdcRocpBase.h" + #include #include #include #include #include + #include "hsa.h" #include "rdc/rdc.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" -#include "rdc_modules/rdc_rocp/RdcRocpBase.h" namespace amd { namespace rdc { RdcRocpBase::RdcRocpBase() { - hsa_status_t err = hsa_init(); - if (err != HSA_STATUS_SUCCESS) { - const char* errstr = nullptr; - hsa_status_string(err, &errstr); - throw std::runtime_error( - "hsa error code: " + std::to_string(err) + " " + errstr); - } + hsa_status_t err = hsa_init(); + if (err != HSA_STATUS_SUCCESS) { + const char* errstr = nullptr; + hsa_status_string(err, &errstr); + throw std::runtime_error("hsa error code: " + std::to_string(err) + " " + errstr); + } - auto status = rocmtools_initialize(); - RDC_LOG(RDC_INFO, "rocmtools_initialize status: " << status); + auto status = rocmtools_initialize(); + RDC_LOG(RDC_INFO, "rocmtools_initialize status: " << status); } RdcRocpBase::~RdcRocpBase() { - for (auto& session : sessions) { - const rdc_status_t status = destroy_session(session.first); - assert(status == RDC_ST_OK); - } - sessions.clear(); - auto status = rocmtools_finalize(); - RDC_LOG(RDC_INFO, "rocmtools_finalize status: " << status); + for (auto& session : sessions) { + const rdc_status_t status = destroy_session(session.first); + assert(status == RDC_ST_OK); + } + sessions.clear(); + auto status = rocmtools_finalize(); + RDC_LOG(RDC_INFO, "rocmtools_finalize status: " << status); - hsa_status_t err = hsa_shut_down(); - if (err != HSA_STATUS_SUCCESS) { - const char* errstr = nullptr; - hsa_status_string(err, &errstr); - // cannot throw an error here. print instead - RDC_LOG( - RDC_ERROR, "hsa error code: " + std::to_string(err) + " " + errstr); - } + hsa_status_t err = hsa_shut_down(); + if (err != HSA_STATUS_SUCCESS) { + const char* errstr = nullptr; + hsa_status_string(err, &errstr); + // cannot throw an error here. 
print instead + RDC_LOG(RDC_ERROR, "hsa error code: " + std::to_string(err) + " " + errstr); + } } -rdc_status_t RdcRocpBase::rocp_lookup( - pair_gpu_field_t gpu_field, - double* value) { - if (sessions.empty()) { - return RDC_ST_NOT_FOUND; - } +rdc_status_t RdcRocpBase::rocp_lookup(pair_gpu_field_t gpu_field, double* value) { + if (sessions.empty()) { + return RDC_ST_NOT_FOUND; + } - if (value == nullptr) { - return RDC_ST_BAD_PARAMETER; - } + if (value == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - rocmtools_device_profile_metric_t counter; - session_info_t session = sessions.at(gpu_field); - const rocmtools_status_t status = - rocmtools_device_profiling_session_poll(session.id, &counter); - session.stop_time = std::chrono::high_resolution_clock::now(); - if (status != ROCMTOOLS_STATUS_SUCCESS) { - return Rocp2RdcError(status); - } - const auto elapsed = std::chrono::duration_cast( - session.stop_time - session.start_time) - .count(); - // some metrics are derived from others and depend on time passed - switch (gpu_field.second) { - case RDC_FI_PROF_GFLOPS_16: - case RDC_FI_PROF_GFLOPS_32: - case RDC_FI_PROF_GFLOPS_64: - case RDC_FI_PROF_MEMR_BW_KBPNS: - case RDC_FI_PROF_MEMW_BW_KBPNS: - *value = counter.value.value / elapsed; - break; - default: - *value = counter.value.value; - break; - } + rocmtools_device_profile_metric_t counter; + session_info_t session = sessions.at(gpu_field); + const rocmtools_status_t status = rocmtools_device_profiling_session_poll(session.id, &counter); + session.stop_time = std::chrono::high_resolution_clock::now(); + if (status != ROCMTOOLS_STATUS_SUCCESS) { return Rocp2RdcError(status); + } + const auto elapsed = + std::chrono::duration_cast(session.stop_time - session.start_time) + .count(); + // some metrics are derived from others and depend on time passed + switch (gpu_field.second) { + case RDC_FI_PROF_GFLOPS_16: + case RDC_FI_PROF_GFLOPS_32: + case RDC_FI_PROF_GFLOPS_64: + case RDC_FI_PROF_MEMR_BW_KBPNS: + case 
RDC_FI_PROF_MEMW_BW_KBPNS: + *value = counter.value.value / elapsed; + break; + default: + *value = counter.value.value; + break; + } + return Rocp2RdcError(status); } rdc_status_t RdcRocpBase::create_session(pair_gpu_field_t gpu_field) { - if (sessions.count(gpu_field) != 0) { - RDC_LOG( - RDC_DEBUG, "Session for field (" << gpu_field.second << ") on GPU [" - << gpu_field.first + if (sessions.count(gpu_field) != 0) { + RDC_LOG(RDC_DEBUG, "Session for field (" << gpu_field.second << ") on GPU [" << gpu_field.first << "] already exists!"); - return RDC_ST_ALREADY_EXIST; - } + return RDC_ST_ALREADY_EXIST; + } - session_info_t session = {}; + session_info_t session = {}; - std::vector rocmtools_fields = { - counter_map_k.at(gpu_field.second)}; - // create session - rocmtools_status_t status = rocmtools_device_profiling_session_create( - rocmtools_fields.data(), rocmtools_fields.size(), &session.id, 0, - gpu_field.first); - - if (status != ROCMTOOLS_STATUS_SUCCESS) { - return Rocp2RdcError(status); - } - - // add start time - session.start_time = std::chrono::high_resolution_clock::now(); - sessions.emplace(gpu_field, session); - - // start session - status = rocmtools_device_profiling_session_start(session.id); + std::vector rocmtools_fields = {counter_map_k.at(gpu_field.second)}; + // create session + rocmtools_status_t status = rocmtools_device_profiling_session_create( + rocmtools_fields.data(), rocmtools_fields.size(), &session.id, 0, gpu_field.first); + if (status != ROCMTOOLS_STATUS_SUCCESS) { return Rocp2RdcError(status); + } + + // add start time + session.start_time = std::chrono::high_resolution_clock::now(); + sessions.emplace(gpu_field, session); + + // start session + status = rocmtools_device_profiling_session_start(session.id); + + return Rocp2RdcError(status); } rdc_status_t RdcRocpBase::destroy_session(pair_gpu_field_t gpu_field) { - if (sessions.empty()) { - RDC_LOG(RDC_DEBUG, "Cannot destroy empty session..."); - return RDC_ST_OK; - } + if 
(sessions.empty()) { + RDC_LOG(RDC_DEBUG, "Cannot destroy empty session..."); + return RDC_ST_OK; + } - // no session with field - if (sessions.count(gpu_field) == 0) { - RDC_LOG( - RDC_DEBUG, "Cannot destroy session with field (" - << gpu_field.second << ") on GPU [" - << gpu_field.first - << "] because it doesn't exist..."); - return RDC_ST_OK; - } + // no session with field + if (sessions.count(gpu_field) == 0) { + RDC_LOG(RDC_DEBUG, "Cannot destroy session with field (" << gpu_field.second << ") on GPU [" + << gpu_field.first + << "] because it doesn't exist..."); + return RDC_ST_OK; + } - const rocmtools_session_id_t session_id = sessions.at(gpu_field).id; - const rocmtools_status_t status = - rocmtools_device_profiling_session_destroy(session_id); - if (status == ROCMTOOLS_STATUS_SUCCESS) { - const auto num_of_destroyed_sessions = sessions.erase(gpu_field); - RDC_LOG( - RDC_DEBUG, - "destroyed (" << num_of_destroyed_sessions << ") sessions"); - } - return Rocp2RdcError(status); + const rocmtools_session_id_t session_id = sessions.at(gpu_field).id; + const rocmtools_status_t status = rocmtools_device_profiling_session_destroy(session_id); + if (status == ROCMTOOLS_STATUS_SUCCESS) { + const auto num_of_destroyed_sessions = sessions.erase(gpu_field); + RDC_LOG(RDC_DEBUG, "destroyed (" << num_of_destroyed_sessions << ") sessions"); + } + return Rocp2RdcError(status); } rdc_status_t RdcRocpBase::Rocp2RdcError(rocmtools_status_t rocm_status) { - switch (rocm_status) { - case ROCMTOOLS_STATUS_SUCCESS: - return RDC_ST_OK; - case ROCMTOOLS_STATUS_ERROR_HAS_ACTIVE_SESSION: - return RDC_ST_ALREADY_EXIST; - case ROCMTOOLS_STATUS_ERROR_SESSION_FILTER_DATA_MISMATCH: - case ROCMTOOLS_STATUS_ERROR_SESSION_MISSING_FILTER: - case ROCMTOOLS_STATUS_ERROR_SESSION_NOT_FOUND: - return RDC_ST_BAD_PARAMETER; - default: - return RDC_ST_UNKNOWN_ERROR; - } + switch (rocm_status) { + case ROCMTOOLS_STATUS_SUCCESS: + return RDC_ST_OK; + case ROCMTOOLS_STATUS_ERROR_HAS_ACTIVE_SESSION: + 
return RDC_ST_ALREADY_EXIST; + case ROCMTOOLS_STATUS_ERROR_SESSION_FILTER_DATA_MISMATCH: + case ROCMTOOLS_STATUS_ERROR_SESSION_MISSING_FILTER: + case ROCMTOOLS_STATUS_ERROR_SESSION_NOT_FOUND: + return RDC_ST_BAD_PARAMETER; + default: + return RDC_ST_UNKNOWN_ERROR; + } } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc index c7cba987de..c8441ba7c5 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include + #include #include #include @@ -37,104 +38,93 @@ amd::rdc::RdcRocpBase rocp; // get supported field ids // TODO: Query fields with rocprofiler -rdc_status_t rdc_telemetry_fields_query( - uint32_t field_ids[MAX_NUM_FIELDS], - uint32_t* field_count) { - // extract all keys from counter_map - std::vector counter_keys; - counter_keys.reserve(amd::rdc::counter_map_k.size()); - for (auto it : amd::rdc::counter_map_k) { - counter_keys.push_back(it.first); - } +rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) { + // extract all keys from counter_map + std::vector counter_keys; + counter_keys.reserve(amd::rdc::counter_map_k.size()); + for (auto it : amd::rdc::counter_map_k) { + counter_keys.push_back(it.first); + } - *field_count = counter_keys.size(); - // copy from vector into array - std::copy(counter_keys.begin(), counter_keys.end(), field_ids); + *field_count = counter_keys.size(); + // copy from vector into array + std::copy(counter_keys.begin(), counter_keys.end(), field_ids); - return RDC_ST_OK; + return RDC_ST_OK; } // Fetch -rdc_status_t rdc_telemetry_fields_value_get( - rdc_gpu_field_t* fields, - uint32_t fields_count, - rdc_field_value_f callback, - void* user_data) { - // - // Bulk fetch fields - std::vector bulk_results; +rdc_status_t 
rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count, + rdc_field_value_f callback, void* user_data) { + // + // Bulk fetch fields + std::vector bulk_results; - struct timeval tv {}; - gettimeofday(&tv, nullptr); - const uint64_t curTime = - static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; + struct timeval tv {}; + gettimeofday(&tv, nullptr); + const uint64_t curTime = static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; - // Fetch it one by one for left fields - const int BULK_FIELDS_MAX = 16; - rdc_gpu_field_value_t values[BULK_FIELDS_MAX]; - uint32_t bulk_count = 0; - rdc_status_t status = RDC_ST_UNKNOWN_ERROR; - double value = 0; - for (uint32_t i = 0; i < fields_count; i++) { - if (bulk_count >= BULK_FIELDS_MAX) { - status = callback(values, bulk_count, user_data); - // When the callback returns errors, stop processing and return. - if (status != RDC_ST_OK) { - return status; - } - bulk_count = 0; - } - - status = rocp.rocp_lookup( - std::make_pair(fields[i].gpu_index, fields[i].field_id), &value); - - // get value - values[bulk_count].gpu_index = fields[i].gpu_index; - values[bulk_count].field_value.type = DOUBLE; - values[bulk_count].field_value.status = status; - values[bulk_count].field_value.ts = curTime; - values[bulk_count].field_value.value.dbl = value; - values[bulk_count].field_value.field_id = fields[i].field_id; - bulk_count++; - } - if (bulk_count != 0) { - rdc_status_t status = callback(values, bulk_count, user_data); - if (status != RDC_ST_OK) { - return status; - } - bulk_count = 0; + // Fetch it one by one for left fields + const int BULK_FIELDS_MAX = 16; + rdc_gpu_field_value_t values[BULK_FIELDS_MAX]; + uint32_t bulk_count = 0; + rdc_status_t status = RDC_ST_UNKNOWN_ERROR; + double value = 0; + for (uint32_t i = 0; i < fields_count; i++) { + if (bulk_count >= BULK_FIELDS_MAX) { + status = callback(values, bulk_count, user_data); + // When the callback returns errors, stop processing and return. 
+ if (status != RDC_ST_OK) { + return status; + } + bulk_count = 0; } - return status; + status = rocp.rocp_lookup(std::make_pair(fields[i].gpu_index, fields[i].field_id), &value); + + // get value + values[bulk_count].gpu_index = fields[i].gpu_index; + values[bulk_count].field_value.type = DOUBLE; + values[bulk_count].field_value.status = status; + values[bulk_count].field_value.ts = curTime; + values[bulk_count].field_value.value.dbl = value; + values[bulk_count].field_value.field_id = fields[i].field_id; + bulk_count++; + } + if (bulk_count != 0) { + rdc_status_t status = callback(values, bulk_count, user_data); + if (status != RDC_ST_OK) { + return status; + } + bulk_count = 0; + } + + return status; } -rdc_status_t rdc_telemetry_fields_watch( - rdc_gpu_field_t* fields, - uint32_t fields_count) { - rdc_status_t status = RDC_ST_OK; - for (uint32_t i = 0; i < fields_count; i++) { - RDC_LOG(RDC_DEBUG, "WATCH: " << fields[i].field_id); - const rdc_status_t temp_status = rocp.create_session( - std::make_pair(fields[i].gpu_index, fields[i].field_id)); - if (temp_status != RDC_ST_OK) { - status = temp_status; - } +rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) { + rdc_status_t status = RDC_ST_OK; + for (uint32_t i = 0; i < fields_count; i++) { + RDC_LOG(RDC_DEBUG, "WATCH: " << fields[i].field_id); + const rdc_status_t temp_status = + rocp.create_session(std::make_pair(fields[i].gpu_index, fields[i].field_id)); + if (temp_status != RDC_ST_OK) { + status = temp_status; } - return status; + } + return status; } -rdc_status_t rdc_telemetry_fields_unwatch( - rdc_gpu_field_t* fields, - uint32_t fields_count) { - rdc_status_t status = RDC_ST_OK; - for (uint32_t i = 0; i < fields_count; i++) { - RDC_LOG(RDC_DEBUG, "UNWATCH: " << fields[i].field_id); - const rdc_status_t temp_status = rocp.destroy_session( - std::make_pair(fields[i].gpu_index, fields[i].field_id)); - // return last non-ok status - if (temp_status != RDC_ST_OK) { - 
status = temp_status; - } +rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) { + rdc_status_t status = RDC_ST_OK; + for (uint32_t i = 0; i < fields_count; i++) { + RDC_LOG(RDC_DEBUG, "UNWATCH: " << fields[i].field_id); + const rdc_status_t temp_status = + rocp.destroy_session(std::make_pair(fields[i].gpu_index, fields[i].field_id)); + // return last non-ok status + if (temp_status != RDC_ST_OK) { + status = temp_status; } - return status; + } + return status; } diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/ComputeQueueTest.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/ComputeQueueTest.cc old mode 100755 new mode 100644 index 42ed69ba0d..b05c4fbb8a --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/ComputeQueueTest.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/ComputeQueueTest.cc @@ -19,30 +19,32 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "rdc_modules/rdc_rocr/ComputeQueueTest.h" + #include +#include #include #include -#include #include -#include -#include #include +#include #include -#include #include -#include "rdc_modules/rdc_rocr/common.h" -#include "rdc_modules/rdc_rocr/ComputeQueueTest.h" -#include "rdc_modules/rdc_rocr/base_rocr_utils.h" +#include +#include + #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" +#include "rdc_modules/rdc_rocr/base_rocr_utils.h" +#include "rdc_modules/rdc_rocr/common.h" namespace amd { namespace rdc { static const uint32_t kNumBufferElements = 256; -ComputeQueueTest::ComputeQueueTest(uint32_t gpu_index): TestBase(gpu_index) { +ComputeQueueTest::ComputeQueueTest(uint32_t gpu_index) : TestBase(gpu_index) { set_num_iteration(10); // Number of iterations to execute of the main test; // This is a default value which can be overridden // on the command line. 
@@ -50,8 +52,7 @@ ComputeQueueTest::ComputeQueueTest(uint32_t gpu_index): TestBase(gpu_index) { set_description("This test will run binary search compute task via AQL."); } -ComputeQueueTest::~ComputeQueueTest(void) { -} +ComputeQueueTest::~ComputeQueueTest(void) {} // Any 1-time setup involving member variables used in the rest of the test // should be done here. @@ -61,7 +62,7 @@ hsa_status_t ComputeQueueTest::SetUp(void) { TestBase::SetUp(); err = SetDefaultAgents(this); - if ( err != HSA_STATUS_SUCCESS) return err; + if (err != HSA_STATUS_SUCCESS) return err; err = SetPoolsTypical(this); return err; @@ -77,9 +78,7 @@ void ComputeQueueTest::Run(void) { TestBase::Run(); } -void ComputeQueueTest::DisplayTestInfo(void) { - TestBase::DisplayTestInfo(); -} +void ComputeQueueTest::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); } void ComputeQueueTest::DisplayResults(void) const { // Compare required profile for this test case with what we're actually @@ -112,8 +111,8 @@ void ComputeQueueTest::InitializeBinarySearch(BinarySearch* bs) { // This function shows how to do an asynchronous copy. We have to create a // signal and use the signal to notify us when the copy has completed. 
-hsa_status_t ComputeQueueTest::AgentMemcpy(void* dst, const void* src, - size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) { +hsa_status_t ComputeQueueTest::AgentMemcpy(void* dst, const void* src, size_t size, + hsa_agent_t dst_ag, hsa_agent_t src_ag) { hsa_signal_t s; hsa_status_t err; @@ -123,8 +122,8 @@ hsa_status_t ComputeQueueTest::AgentMemcpy(void* dst, const void* src, err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s); throw_if_error(err); - if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, - UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) { + if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) { err = HSA_STATUS_ERROR; RDC_LOG(RDC_ERROR, "Async copy signal error"); @@ -141,22 +140,19 @@ hsa_status_t ComputeQueueTest::AgentMemcpy(void* dst, const void* src, hsa_status_t ComputeQueueTest::FindPools(BinarySearch* bs) { hsa_status_t err; - err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindStandardPool, - &bs->cpu_pool); + err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindStandardPool, &bs->cpu_pool); if (err != HSA_STATUS_INFO_BREAK) { return HSA_STATUS_ERROR; } - err = hsa_amd_agent_iterate_memory_pools(bs->gpu_dev, FindStandardPool, - &bs->gpu_pool); + err = hsa_amd_agent_iterate_memory_pools(bs->gpu_dev, FindStandardPool, &bs->gpu_pool); if (err != HSA_STATUS_INFO_BREAK) { return HSA_STATUS_ERROR; } - err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, - FindKernArgPool, &bs->kern_arg_pool); + err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindKernArgPool, &bs->kern_arg_pool); if (err != HSA_STATUS_INFO_BREAK) { return HSA_STATUS_ERROR; @@ -203,7 +199,7 @@ hsa_status_t ComputeQueueTest::AllocateAndInitBuffers(BinarySearch* bs) { (void)memset(bs->input, 0, in_length); err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0, - reinterpret_cast(&bs->input_arr_local)); + reinterpret_cast(&bs->input_arr_local)); throw_if_error(err); err = 
hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr_local); throw_if_error(err); @@ -218,7 +214,7 @@ hsa_status_t ComputeQueueTest::AllocateAndInitBuffers(BinarySearch* bs) { for (uint32_t i = 1; i < bs->length; ++i) { bs->input[i] = bs->input[i - 1] + - static_cast(max * rand_r(&seed) / static_cast(RAND_MAX)); + static_cast(max * rand_r(&seed) / static_cast(RAND_MAX)); } return err; @@ -238,11 +234,10 @@ hsa_status_t ComputeQueueTest::LoadKernelFromObjFile(BinarySearch* bs) { err = hsa_agent_get_info(bs->gpu_dev, HSA_AGENT_INFO_NAME, agent_name); throw_if_error(err); - std::string kernel_file = search_hsaco_full_path( - bs->kernel_file_name.c_str(), agent_name); + std::string kernel_file = search_hsaco_full_path(bs->kernel_file_name.c_str(), agent_name); if (kernel_file == "") { - RDC_LOG(RDC_ERROR, "failed to open " << bs->kernel_file_name.c_str() << - " at line " << __LINE__ << ", errno: " << errno); + RDC_LOG(RDC_ERROR, "failed to open " << bs->kernel_file_name.c_str() << " at line " << __LINE__ + << ", errno: " << errno); std::string msg("fail to open "); msg += bs->kernel_file_name; throw_if_skip(msg); @@ -251,8 +246,8 @@ hsa_status_t ComputeQueueTest::LoadKernelFromObjFile(BinarySearch* bs) { hsa_file_t file_handle = open(kernel_file.c_str(), O_RDONLY); if (file_handle == -1) { - RDC_LOG(RDC_ERROR, "failed to open " << bs->kernel_file_name.c_str() << - " at line " << __LINE__ << ", errno: " << errno); + RDC_LOG(RDC_ERROR, "failed to open " << bs->kernel_file_name.c_str() << " at line " << __LINE__ + << ", errno: " << errno); return HSA_STATUS_ERROR; } @@ -260,46 +255,40 @@ hsa_status_t ComputeQueueTest::LoadKernelFromObjFile(BinarySearch* bs) { throw_if_error(err); close(file_handle); - err = hsa_executable_create_alt(HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &executable); + err = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, + &executable); throw_if_error(err); - err = 
hsa_executable_load_agent_code_object(executable, bs->gpu_dev, - code_obj_rdr, NULL, NULL); + err = hsa_executable_load_agent_code_object(executable, bs->gpu_dev, code_obj_rdr, NULL, NULL); throw_if_error(err); err = hsa_executable_freeze(executable, NULL); throw_if_error(err); hsa_executable_symbol_t kern_sym; - err = hsa_executable_get_symbol(executable, NULL, bs->kernel_name.c_str(), - bs->gpu_dev, 0, &kern_sym); + err = hsa_executable_get_symbol(executable, NULL, bs->kernel_name.c_str(), bs->gpu_dev, 0, + &kern_sym); throw_if_error(err); - err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, - &bs->kernel_object); + err = hsa_executable_symbol_get_info(kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &bs->kernel_object); throw_if_error(err); - err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, - &bs->private_segment_size); + err = hsa_executable_symbol_get_info( + kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &bs->private_segment_size); throw_if_error(err); - err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, - &bs->group_segment_size); + err = hsa_executable_symbol_get_info( + kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &bs->group_segment_size); throw_if_error(err); // Remaining queries not supported on code object v3. 
- err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, - &bs->kernarg_size); + err = hsa_executable_symbol_get_info( + kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &bs->kernarg_size); throw_if_error(err); - err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, - &bs->kernarg_align); + err = hsa_executable_symbol_get_info( + kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, &bs->kernarg_align); throw_if_error(err); assert(bs->kernarg_align >= 16 && "Reported kernarg size is too small."); bs->kernarg_align = (bs->kernarg_align == 0) ? 16 : bs->kernarg_align; @@ -310,7 +299,7 @@ hsa_status_t ComputeQueueTest::LoadKernelFromObjFile(BinarySearch* bs) { // This function populates the AQL patch with the information // we have collected and stored in the BinarySearch structure thus far. void ComputeQueueTest::PopulateAQLPacket(BinarySearch const* bs, - hsa_kernel_dispatch_packet_t* aql) { + hsa_kernel_dispatch_packet_t* aql) { aql->header = 0; // Dummy val. for now. 
Set this right before doorbell ring aql->setup = 1; aql->workgroup_size_x = bs->work_group_size; @@ -326,8 +315,7 @@ void ComputeQueueTest::PopulateAQLPacket(BinarySearch const* bs, aql->completion_signal = bs->signal; } -void ComputeQueueTest::WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql, - hsa_queue_t* q) { +void ComputeQueueTest::WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql, hsa_queue_t* q) { void* queue_base = q->base_address; const uint32_t queue_mask = q->size - 1; uint64_t que_idx = hsa_queue_add_write_index_relaxed(q, 1); @@ -335,8 +323,7 @@ void ComputeQueueTest::WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aq hsa_kernel_dispatch_packet_t* queue_aql_packet; queue_aql_packet = - &(reinterpret_cast(queue_base)) - [que_idx & queue_mask]; + &(reinterpret_cast(queue_base))[que_idx & queue_mask]; queue_aql_packet->workgroup_size_x = in_aql->workgroup_size_x; queue_aql_packet->workgroup_size_y = in_aql->workgroup_size_y; @@ -351,11 +338,10 @@ void ComputeQueueTest::WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aq queue_aql_packet->completion_signal = in_aql->completion_signal; } - // This function allocates memory from the kern_arg pool we already found, and // then sets the argument values needed by the kernel code. 
-hsa_status_t ComputeQueueTest::AllocAndSetKernArgs(BinarySearch* bs, void* args, - size_t arg_size, void** aql_buf_ptr) { +hsa_status_t ComputeQueueTest::AllocAndSetKernArgs(BinarySearch* bs, void* args, size_t arg_size, + void** aql_buf_ptr) { void* kern_arg_buf = nullptr; hsa_status_t err; size_t buf_size; @@ -448,11 +434,9 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) { uint32_t global_lower_bound = 0; uint32_t global_upper_bound = bs->length - 1; - uint32_t sub_div_size = (global_upper_bound - global_lower_bound + 1) / - bs->num_sub_divisions; + uint32_t sub_div_size = (global_upper_bound - global_lower_bound + 1) / bs->num_sub_divisions; - if ((bs->input[0] > bs->find_me) || - (bs->input[bs->length - 1] < bs->find_me)) { + if ((bs->input[0] > bs->find_me) || (bs->input[bs->length - 1] < bs->find_me)) { bs->output[0] = 0; bs->output[1] = bs->length - 1; bs->output[2] = 0; @@ -472,7 +456,7 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) { typedef uint32_t uint4[4]; struct __attribute__((aligned(16))) local_args_t { uint4* outputArray; - uint2* sortedArray; + uint2* sortedArray; uint32_t findMe; uint32_t pad; uint64_t global_offset_x; @@ -494,8 +478,7 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) { local_args.completion_action = 0; // Copy the kernel args structure into kernel arg memory - err = AllocAndSetKernArgs(bs, &local_args, sizeof(local_args), - &bs->kern_arg_address); + err = AllocAndSetKernArgs(bs, &local_args, sizeof(local_args), &bs->kern_arg_address); throw_if_error(err); // Populate an AQL packet with the info we've gathered @@ -505,7 +488,7 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) { uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t); while ((sub_div_size > 1) && (bs->output[3] != 0)) { - for (uint32_t i = 0 ; i < bs->num_sub_divisions; i++) { + for (uint32_t i = 0; i < bs->num_sub_divisions; i++) { int idx1 = i * sub_div_size; int idx2 = ((i + 1) * sub_div_size) - 1; bs->input_arr[2 * i] = 
bs->input[idx1]; @@ -513,9 +496,9 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) { } // Copy kernel parameter from system memory to local memory - err = AgentMemcpy(reinterpret_cast(bs->input_arr_local), - reinterpret_cast(bs->input_arr), - in_length, bs->gpu_dev, bs->cpu_dev); + err = + AgentMemcpy(reinterpret_cast(bs->input_arr_local), + reinterpret_cast(bs->input_arr), in_length, bs->gpu_dev, bs->cpu_dev); throw_if_error(err); @@ -535,10 +518,8 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) { WriteAQLToQueue(&aql, bs->queue); uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH; - aql_header |= HSA_FENCE_SCOPE_SYSTEM << - HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - aql_header |= HSA_FENCE_SCOPE_SYSTEM << - HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + aql_header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + aql_header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; // Set the packet's type, acquire and release fences. This should be done // atomically after all the other fields have been set, using release @@ -546,9 +527,9 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) { // signal is activated. void* q_base = bs->queue->base_address; - AtomicSetPacketHeader(aql_header, aql.setup, - &(reinterpret_cast - (q_base))[que_idx & mask]); + AtomicSetPacketHeader( + aql_header, aql.setup, + &(reinterpret_cast(q_base))[que_idx & mask]); // Increment the write index and ring the doorbell to dispatch kernel. hsa_queue_store_write_index_relaxed(bs->queue, (que_idx + 1)); @@ -563,9 +544,8 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) { // the queue is less than 1. When the kernel associated with the queued AQL // packet has completed execution, the signal value is automatically // decremented by the packet processor. 
- hsa_signal_value_t value = hsa_signal_wait_scacquire(bs->signal, - HSA_SIGNAL_CONDITION_LT, 1, - UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + hsa_signal_value_t value = hsa_signal_wait_scacquire(bs->signal, HSA_SIGNAL_CONDITION_LT, 1, + UINT64_MAX, HSA_WAIT_STATE_BLOCKED); // value should be 0, or we timed-out if (value) { @@ -579,8 +559,7 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) { // Binary search algorithm stuff... global_lower_bound = bs->output[0] * sub_div_size; global_upper_bound = global_lower_bound + sub_div_size - 1; - sub_div_size = (global_upper_bound - global_lower_bound + 1) / - bs->num_sub_divisions; + sub_div_size = (global_upper_bound - global_lower_bound + 1) / bs->num_sub_divisions; } uint32_t element_index = UINT_MAX; @@ -655,8 +634,8 @@ hsa_status_t ComputeQueueTest::RunBinarySearchTest(void) { err = hsa_signal_create(1, 0, NULL, &bs.signal); throw_if_error(err, "Fail to create signal."); - err = hsa_queue_create(bs.gpu_dev, 128, HSA_QUEUE_TYPE_MULTI, NULL, NULL, - UINT32_MAX, UINT32_MAX, &bs.queue); + err = hsa_queue_create(bs.gpu_dev, 128, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, + &bs.queue); throw_if_error(err, "Fail to create queue."); err = FindPools(&bs); diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/MemoryAccess.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/MemoryAccess.cc old mode 100755 new mode 100644 index 51a8c75dbb..8a9d4ffc15 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/MemoryAccess.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/MemoryAccess.cc @@ -20,34 +20,36 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "rdc_modules/rdc_rocr/MemoryAccess.h" + #include + #include #include -#include #include +#include -#include "rdc_modules/rdc_rocr/common.h" -#include "rdc_modules/rdc_rocr/MemoryAccess.h" -#include "rdc_modules/rdc_rocr/base_rocr_utils.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" +#include "rdc_modules/rdc_rocr/base_rocr_utils.h" +#include "rdc_modules/rdc_rocr/common.h" namespace amd { namespace rdc { -MemoryAccessTest::MemoryAccessTest(uint32_t gpu_index): TestBase(gpu_index) { +MemoryAccessTest::MemoryAccessTest(uint32_t gpu_index) : TestBase(gpu_index) { set_num_iteration(10); // Number of iterations to execute of the main test; // This is a default value which can be overridden // on the command line. set_title("RocR Memory Access Tests"); - set_description("This series of tests check memory allocation" - "on GPU and CPU, i.e. GPU access to system memory " - "and CPU access to GPU memory."); + set_description( + "This series of tests check memory allocation" + "on GPU and CPU, i.e. GPU access to system memory " + "and CPU access to GPU memory."); } -MemoryAccessTest::~MemoryAccessTest(void) { -} +MemoryAccessTest::~MemoryAccessTest(void) {} // Any 1-time setup involving member variables used in the rest of the test // should be done here. 
@@ -74,9 +76,7 @@ void MemoryAccessTest::Run(void) { TestBase::Run(); } -void MemoryAccessTest::DisplayTestInfo(void) { - TestBase::DisplayTestInfo(); -} +void MemoryAccessTest::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); } void MemoryAccessTest::DisplayResults(void) const { // Compare required profile for this test case with what we're actually @@ -92,18 +92,17 @@ void MemoryAccessTest::Close() { TestBase::Close(); } +typedef struct __attribute__((aligned(16))) args_t { + int* a; + int* b; + int* c; +} args; -typedef struct __attribute__ ((aligned(16))) args_t { - int *a; - int *b; - int *c; - } args; - - args *kernArgs = NULL; +args* kernArgs = NULL; static const char kSubTestSeparator[] = " **************************"; -static void PrintMemorySubtestHeader(const char *header) { +static void PrintMemorySubtestHeader(const char* header) { RDC_LOG(RDC_DEBUG, " *** Memory Subtest: " << header << " ***"); } @@ -113,80 +112,64 @@ static const int kMemoryAllocSize = 8; static const int kMemoryAllocSize = 1024; #endif - // Test to check GPU can read & write to system memory -void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, - hsa_agent_t gpuAgent) { +void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent) { hsa_status_t err; // Get Global Memory Pool on the gpuAgent to allocate gpu buffers hsa_amd_memory_pool_t gpu_pool; - err = hsa_amd_agent_iterate_memory_pools(gpuAgent, - GetGlobalMemoryPool, - &gpu_pool); + err = hsa_amd_agent_iterate_memory_pools(gpuAgent, GetGlobalMemoryPool, &gpu_pool); throw_if_error(err); hsa_amd_memory_pool_access_t access; - hsa_amd_agent_memory_pool_get_info(cpuAgent, gpu_pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &access); + hsa_amd_agent_memory_pool_get_info(cpuAgent, gpu_pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &access); if (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { // hsa objects - hsa_queue_t *queue = NULL; // command queue + hsa_queue_t* queue = 
NULL; // command queue hsa_signal_t signal = {0}; // completion signal // get queue size uint32_t queue_size = 0; - err = hsa_agent_get_info(gpuAgent, - HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); + err = hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); throw_if_error(err); // create queue - err = hsa_queue_create(gpuAgent, - queue_size, HSA_QUEUE_TYPE_MULTI, - NULL, NULL, 0, 0, &queue); + err = hsa_queue_create(gpuAgent, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue); throw_if_error(err); // Get System Memory Pool on the cpuAgent to allocate host side buffers hsa_amd_memory_pool_t global_pool; - err = hsa_amd_agent_iterate_memory_pools(cpuAgent, - GetGlobalMemoryPool, - &global_pool); + err = hsa_amd_agent_iterate_memory_pools(cpuAgent, GetGlobalMemoryPool, &global_pool); throw_if_error(err); // Find a memory pool that supports kernel arguments. hsa_amd_memory_pool_t kernarg_pool; - err = hsa_amd_agent_iterate_memory_pools(cpuAgent, - GetKernArgMemoryPool, - &kernarg_pool); + err = hsa_amd_agent_iterate_memory_pools(cpuAgent, GetKernArgMemoryPool, &kernarg_pool); throw_if_error(err); // Allocate the host side buffers // (sys_data,dup_sys_data,cpuResult,kernArg) on system memory - int *sys_data = NULL; - int *dup_sys_data = NULL; - int *cpuResult = NULL; - int *gpuResult = NULL; + int* sys_data = NULL; + int* dup_sys_data = NULL; + int* cpuResult = NULL; + int* gpuResult = NULL; - err = hsa_amd_memory_pool_allocate(global_pool, - kMemoryAllocSize, 0, - reinterpret_cast(&cpuResult)); + err = hsa_amd_memory_pool_allocate(global_pool, kMemoryAllocSize, 0, + reinterpret_cast(&cpuResult)); throw_if_error(err); - err = hsa_amd_memory_pool_allocate(global_pool, - kMemoryAllocSize, 0, - reinterpret_cast(&sys_data)); + err = hsa_amd_memory_pool_allocate(global_pool, kMemoryAllocSize, 0, + reinterpret_cast(&sys_data)); throw_if_error(err); - err = hsa_amd_memory_pool_allocate(global_pool, - kMemoryAllocSize, 0, - 
reinterpret_cast(&dup_sys_data)); + err = hsa_amd_memory_pool_allocate(global_pool, kMemoryAllocSize, 0, + reinterpret_cast(&dup_sys_data)); throw_if_error(err); - // Allocate the kernel argument buffer from the kernarg_pool. err = hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(args_t), 0, - reinterpret_cast(&kernArgs)); + reinterpret_cast(&kernArgs)); throw_if_error(err); // initialize the host buffers @@ -204,10 +187,9 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, // Get local memory of GPU to allocate device side buffers err = hsa_amd_memory_pool_allocate(gpu_pool, kMemoryAllocSize, 0, - reinterpret_cast(&gpuResult)); + reinterpret_cast(&gpuResult)); throw_if_error(err); - // Allow cpuAgent access to all allocated GPU memory. err = hsa_amd_agents_allow_access(1, &cpuAgent, NULL, gpuResult); throw_if_error(err); @@ -227,7 +209,6 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, kernArgs->b = cpuResult; // system memory passed to gpu for write kernArgs->c = gpuResult; // gpu memory to verify that gpu read system data - // Create the executable, get symbol by name and load the code object set_kernel_file_name("gpuReadWrite_kernels.hsaco"); set_kernel_name("gpuReadWrite"); @@ -268,22 +249,22 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, WriteAQLToQueueLoc(queue, index, &aql); - hsa_kernel_dispatch_packet_t *q_base_addr = - reinterpret_cast(queue->base_address); + hsa_kernel_dispatch_packet_t* q_base_addr = + reinterpret_cast(queue->base_address); AtomicSetPacketHeader( (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE), - (1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS), - reinterpret_cast - (&q_base_addr[index & queue_mask])); + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << 
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE), + (1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS), + reinterpret_cast(&q_base_addr[index & queue_mask])); // ringdoor bell hsa_signal_store_relaxed(queue->doorbell_signal, index); // wait for the signal and reset it for future use - while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t)-1, HSA_WAIT_STATE_ACTIVE)) { } + while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, + HSA_WAIT_STATE_ACTIVE)) { + } hsa_signal_store_relaxed(signal, 1); // compare device and host side results @@ -292,8 +273,7 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, } for (int i = 0; i < kMemoryAllocSize; ++i) { if (gpuResult[i] != dup_sys_data[i]) { - throw_if_error(HSA_STATUS_ERROR, - "gpuResult does not match dup_sys_data."); + throw_if_error(HSA_STATUS_ERROR, "gpuResult does not match dup_sys_data."); } } @@ -304,7 +284,7 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, for (int i = 0; i < kMemoryAllocSize; ++i) { if (cpuResult[i] != i) { throw_if_error(HSA_STATUS_ERROR, - "The CPU memory size does not match the system memory size."); + "The CPU memory size does not match the system memory size."); } } @@ -312,27 +292,39 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, RDC_LOG(RDC_DEBUG, "gpu has written to system memory successfully"); } - if (sys_data) { hsa_memory_free(sys_data); } - if (dup_sys_data) { hsa_memory_free(dup_sys_data); } - if (cpuResult) {hsa_memory_free(cpuResult); } - if (gpuResult) {hsa_memory_free(gpuResult); } - if (kernArgs) { hsa_memory_free(kernArgs); } - if (signal.handle) { hsa_signal_destroy(signal); } - if (queue) { hsa_queue_destroy(queue); } + if (sys_data) { + hsa_memory_free(sys_data); + } + if (dup_sys_data) { + hsa_memory_free(dup_sys_data); + } + if (cpuResult) { + hsa_memory_free(cpuResult); + } + if 
(gpuResult) { + hsa_memory_free(gpuResult); + } + if (kernArgs) { + hsa_memory_free(kernArgs); + } + if (signal.handle) { + hsa_signal_destroy(signal); + } + if (queue) { + hsa_queue_destroy(queue); + } } else { if (verbosity() > 0) { - RDC_LOG(RDC_DEBUG, - "Test not applicable as system is not large bar, skipping"); + RDC_LOG(RDC_DEBUG, "Test not applicable as system is not large bar, skipping"); } return; } } // Test to check cpu can read & write to GPU memory -void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, - hsa_agent_t, - hsa_amd_memory_pool_t pool) { +void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t, + hsa_amd_memory_pool_t pool) { hsa_status_t err; pool_info_t pool_i; @@ -340,14 +332,12 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, throw_if_error(err); if (pool_i.segment == HSA_AMD_SEGMENT_GLOBAL && - pool_i.global_flag == HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { + pool_i.global_flag == HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { hsa_amd_memory_pool_access_t access; - hsa_amd_agent_memory_pool_get_info(cpuAgent, pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &access); + hsa_amd_agent_memory_pool_get_info(cpuAgent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &access); if (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { - if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 || - pool_i.alloc_alignment == 0) { + if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 || pool_i.alloc_alignment == 0) { if (verbosity() > 0) { RDC_LOG(RDC_DEBUG, "Test not applicable. 
Skipping."); } @@ -356,10 +346,10 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, auto gran_sz = pool_i.alloc_granule; auto pool_sz = pool_i.size / gran_sz; - auto max_alloc_size = pool_sz/2; - unsigned int max_element = max_alloc_size/sizeof(unsigned int); - unsigned int *gpu_data; - unsigned int *sys_data; + auto max_alloc_size = pool_sz / 2; + unsigned int max_element = max_alloc_size / sizeof(unsigned int); + unsigned int* gpu_data; + unsigned int* sys_data; sys_data = (unsigned int*)malloc(max_alloc_size); memset(sys_data, 0, max_alloc_size); for (unsigned int i = 1; i <= max_element; ++i) { @@ -368,7 +358,7 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, // err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, sys_data); // EXPECT_EQ(err, HSA_STATUS_SUCCESS); err = hsa_amd_memory_pool_allocate(pool, max_alloc_size, 0, - reinterpret_cast(&gpu_data)); + reinterpret_cast(&gpu_data)); throw_if_error(err); /* if (err == HSA_STATUS_ERROR) { @@ -385,21 +375,22 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, gpu_data[i] = i; // Write to gpu memory directly } - for (unsigned int i = 1; i <= max_element; ++i) { - if (sys_data[i] != gpu_data[i]) { // Reading GPU memory - fprintf(stdout, "Values not mathing !! sys_data[%d]:%d ," - "gpu_data[%d]\n", sys_data[i], i, gpu_data[i]); - } - } - RDC_LOG(RDC_DEBUG, "CPU have read & write to GPU memory successfully"); - err = hsa_amd_memory_pool_free(gpu_data); - free(sys_data); - } else { - if (verbosity() > 0) { - RDC_LOG(RDC_DEBUG, - "Test not applicable as system is not large bar, Skipping."); + for (unsigned int i = 1; i <= max_element; ++i) { + if (sys_data[i] != gpu_data[i]) { // Reading GPU memory + fprintf(stdout, + "Values not mathing !! 
sys_data[%d]:%d ," + "gpu_data[%d]\n", + sys_data[i], i, gpu_data[i]); } - return; + } + RDC_LOG(RDC_DEBUG, "CPU have read & write to GPU memory successfully"); + err = hsa_amd_memory_pool_free(gpu_data); + free(sys_data); + } else { + if (verbosity() > 0) { + RDC_LOG(RDC_DEBUG, "Test not applicable as system is not large bar, Skipping."); + } + return; } } } @@ -416,12 +407,10 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(void) { std::vector gpus; err = hsa_iterate_agents(IterateGPUAgents, &gpus); throw_if_error(err); - for (unsigned int i = 0 ; i< gpus.size(); ++i) { + for (unsigned int i = 0; i < gpus.size(); ++i) { hsa_amd_memory_pool_t gpu_pool; memset(&gpu_pool, 0, sizeof(gpu_pool)); - err = hsa_amd_agent_iterate_memory_pools(gpus[i], - GetGlobalMemoryPool, - &gpu_pool); + err = hsa_amd_agent_iterate_memory_pools(gpus[i], GetGlobalMemoryPool, &gpu_pool); throw_if_error(err); if (gpu_pool.handle == 0) { RDC_LOG(RDC_DEBUG, "no global mempool in gpu agent"); @@ -466,4 +455,4 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(void) { } } // namespace rdc -} // namespace amd \ No newline at end of file +} // namespace amd diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/MemoryTest.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/MemoryTest.cc old mode 100755 new mode 100644 index 4c5b09f142..c12478d71a --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/MemoryTest.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/MemoryTest.cc @@ -20,32 +20,35 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "rdc_modules/rdc_rocr/MemoryTest.h" + #include #include -#include #include -#include "rdc_modules/rdc_rocr/common.h" -#include "rdc_modules/rdc_rocr/MemoryTest.h" -#include "rdc_modules/rdc_rocr/base_rocr_utils.h" +#include + #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" +#include "rdc_modules/rdc_rocr/base_rocr_utils.h" +#include "rdc_modules/rdc_rocr/common.h" namespace amd { namespace rdc { static const uint32_t kNumBufferElements = 256; -MemoryTest::MemoryTest(uint32_t gpu_index): TestBase(gpu_index) { +MemoryTest::MemoryTest(uint32_t gpu_index) : TestBase(gpu_index) { set_num_iteration(10); // Number of iterations to execute of the main test; // This is a default value which can be overridden // on the command line. set_title("Max Single Allocation Memory Test"); - set_description("This series of tests check memory allocation limits, extent" - " of GPU access to system memory and other memory related functionality."); + set_description( + "This series of tests check memory allocation limits, extent" + " of GPU access to system memory and other memory related " + "functionality."); } -MemoryTest::~MemoryTest(void) { -} +MemoryTest::~MemoryTest(void) {} // Any 1-time setup involving member variables used in the rest of the test // should be done here. 
@@ -55,7 +58,7 @@ hsa_status_t MemoryTest::SetUp(void) { TestBase::SetUp(); err = SetDefaultAgents(this); - if ( err != HSA_STATUS_SUCCESS) return err; + if (err != HSA_STATUS_SUCCESS) return err; err = SetPoolsTypical(this); return err; @@ -71,9 +74,7 @@ void MemoryTest::Run(void) { TestBase::Run(); } -void MemoryTest::DisplayTestInfo(void) { - TestBase::DisplayTestInfo(); -} +void MemoryTest::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); } void MemoryTest::DisplayResults(void) const { // Compare required profile for this test case with what we're actually @@ -92,7 +93,7 @@ void MemoryTest::Close() { } hsa_status_t MemoryTest::TestAllocate(hsa_amd_memory_pool_t pool, size_t sz) { - void *ptr; + void* ptr; hsa_status_t err; err = hsa_amd_memory_pool_allocate(pool, sz, 0, &ptr); @@ -106,13 +107,12 @@ hsa_status_t MemoryTest::TestAllocate(hsa_amd_memory_pool_t pool, size_t sz) { static const char kSubTestSeparator[] = " **************************"; -static void PrintMemorySubtestHeader(const char *header) { +static void PrintMemorySubtestHeader(const char* header) { RDC_LOG(RDC_DEBUG, " *** Memory Subtest: " << header << " ***"); } // Test Fixtures -hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag, - hsa_amd_memory_pool_t pool) { +hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag, hsa_amd_memory_pool_t pool) { hsa_status_t err = HSA_STATUS_SUCCESS; pool_info_t pool_i; @@ -142,19 +142,17 @@ hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag, device_type = "DSP"; break; } - RDC_LOG(RDC_DEBUG, " Agent: " << ag_name << " Node " << node << " (" - << device_type << ")"); + RDC_LOG(RDC_DEBUG, " Agent: " << ag_name << " Node " << node << " (" << device_type << ")"); } err = AcquirePoolInfo(pool, &pool_i); if (err != HSA_STATUS_SUCCESS) return err; if (verbosity() > 0) { - DumpMemoryPoolInfo(&pool_i, 2); + DumpMemoryPoolInfo(&pool_i, 2); } - if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 || - pool_i.alloc_alignment == 
0) { + if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 || pool_i.alloc_alignment == 0) { if (verbosity() > 0) { RDC_LOG(RDC_DEBUG, " Test not applicable. Skipping."); } @@ -165,25 +163,24 @@ hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag, auto pool_sz = pool_i.aggregate_alloc_max / gran_sz; // Neg. test: Try to allocate more than the pool size - err = TestAllocate(pool, pool_sz*gran_sz + gran_sz); + err = TestAllocate(pool, pool_sz * gran_sz + gran_sz); if (err != HSA_STATUS_ERROR_INVALID_ALLOCATION) return err; - auto max_alloc_size = pool_sz/2; + auto max_alloc_size = pool_sz / 2; uint64_t upper_bound = pool_sz; uint64_t lower_bound = 0; while (true) { err = TestAllocate(pool, max_alloc_size * gran_sz); - if (err != HSA_STATUS_SUCCESS || - err != HSA_STATUS_ERROR_OUT_OF_RESOURCES) return err; + if (err != HSA_STATUS_SUCCESS || err != HSA_STATUS_ERROR_OUT_OF_RESOURCES) return err; if (err == HSA_STATUS_SUCCESS) { lower_bound = max_alloc_size; - max_alloc_size += (upper_bound - lower_bound)/2; + max_alloc_size += (upper_bound - lower_bound) / 2; } else if (err == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { upper_bound = max_alloc_size; - max_alloc_size -= (upper_bound - lower_bound)/2; + max_alloc_size -= (upper_bound - lower_bound) / 2; } if ((upper_bound - lower_bound) < 2) { @@ -197,15 +194,14 @@ hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag, } if (verbosity() > 0) { - RDC_LOG(RDC_DEBUG, " Biggest single allocation size for this pool is " << - (max_alloc_size * gran_sz)/1024 << "KB."); - RDC_LOG(RDC_DEBUG, " This is " << - static_cast(max_alloc_size)/pool_sz*100 << - "% of the total."); + RDC_LOG(RDC_DEBUG, " Biggest single allocation size for this pool is " + << (max_alloc_size * gran_sz) / 1024 << "KB."); + RDC_LOG(RDC_DEBUG, " This is " << static_cast(max_alloc_size) / pool_sz * 100 + << "% of the total."); } if (ag_type == HSA_DEVICE_TYPE_GPU) { - if ((float)max_alloc_size/pool_sz < (float)15/16) { + if 
((float)max_alloc_size / pool_sz < (float)15 / 16) { RDC_LOG(RDC_ERROR, "the allocate size is wrong"); throw_if_error(HSA_STATUS_ERROR, "The allocate size is wrong"); } @@ -233,8 +229,7 @@ hsa_status_t MemoryTest::MaxSingleAllocationTest(void) { auto pool_idx = 0; for (auto a : agent_pools) { - if (a->agent.handle != current_gpu.handle) - continue; + if (a->agent.handle != current_gpu.handle) continue; for (auto p : a->pools) { pool_idx++; RDC_LOG(RDC_DEBUG, " Pool " << pool_idx << ":"); diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcDiagnosticLib.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcDiagnosticLib.cc index f91b1034f3..8f03449e06 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcDiagnosticLib.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcDiagnosticLib.cc @@ -19,180 +19,168 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "rdc_modules/rdc_rocr/RdcDiagnosticLib.h" + #include + #include #include + #include "rdc_lib/rdc_common.h" -#include "rdc_modules/rdc_rocr/common.h" -#include "rdc_modules/rdc_rocr/RdcDiagnosticLib.h" -#include "rdc_modules/rdc_rocr/MemoryTest.h" -#include "rdc_modules/rdc_rocr/MemoryAccess.h" #include "rdc_modules/rdc_rocr/ComputeQueueTest.h" +#include "rdc_modules/rdc_rocr/MemoryAccess.h" +#include "rdc_modules/rdc_rocr/MemoryTest.h" +#include "rdc_modules/rdc_rocr/common.h" -rdc_status_t rdc_diag_init(uint64_t) { - return RDC_ST_OK; -} +rdc_status_t rdc_diag_init(uint64_t) { return RDC_ST_OK; } -rdc_status_t rdc_diag_destroy() { - return RDC_ST_OK; -} +rdc_status_t rdc_diag_destroy() { return RDC_ST_OK; } -rdc_status_t rdc_diag_test_cases_query( - rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], - uint32_t* test_case_count) { - if (test_case_count == nullptr) { - return RDC_ST_BAD_PARAMETER; - } +rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t 
test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + if (test_case_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } - *test_case_count = 2; - test_cases[0] = RDC_DIAG_COMPUTE_QUEUE; - test_cases[1] = RDC_DIAG_SYS_MEM_CHECK; + *test_case_count = 2; + test_cases[0] = RDC_DIAG_COMPUTE_QUEUE; + test_cases[1] = RDC_DIAG_SYS_MEM_CHECK; - return RDC_ST_OK; + return RDC_ST_OK; } // Helper function to run the memory test on GPU -static rdc_status_t run_memory_test(uint32_t gpu_index, - rdc_diag_test_result_t* result) { - std::string info = result->info; - std::string per_gpu_info = result->gpu_results[gpu_index].gpu_result.msg; +static rdc_status_t run_memory_test(uint32_t gpu_index, rdc_diag_test_result_t* result) { + std::string info = result->info; + std::string per_gpu_info = result->gpu_results[gpu_index].gpu_result.msg; - try { - amd::rdc::MemoryTest test(gpu_index); - test.MaxSingleAllocationTest(); + try { + amd::rdc::MemoryTest test(gpu_index); + test.MaxSingleAllocationTest(); - info += test.get_gpu_info(); - per_gpu_info += test.get_per_gpu_info(); - } catch (const amd::rdc::SkipException& e) { - result->status = RDC_DIAG_RESULT_SKIP; - per_gpu_info += "MaxSingleAllocationTest is skipped: "; - per_gpu_info += e.what(); - info += "GPU "; - info += std::to_string(gpu_index); - info += " MaxSingleAllocationTest is skipped: "; - info += e.what(); - info += "."; - } catch (const std::exception& e) { - result->status = RDC_DIAG_RESULT_FAIL; - per_gpu_info += "MaxSingleAllocationTest returns with error "; - per_gpu_info += e.what(); - info += "GPU "; - info += std::to_string(gpu_index); - info += " MaxSingleAllocationTest returns with error "; - info += e.what(); - info += "."; - } + info += test.get_gpu_info(); + per_gpu_info += test.get_per_gpu_info(); + } catch (const amd::rdc::SkipException& e) { + result->status = RDC_DIAG_RESULT_SKIP; + per_gpu_info += "MaxSingleAllocationTest is skipped: "; + per_gpu_info += e.what(); + info += "GPU "; + info += 
std::to_string(gpu_index); + info += " MaxSingleAllocationTest is skipped: "; + info += e.what(); + info += "."; + } catch (const std::exception& e) { + result->status = RDC_DIAG_RESULT_FAIL; + per_gpu_info += "MaxSingleAllocationTest returns with error "; + per_gpu_info += e.what(); + info += "GPU "; + info += std::to_string(gpu_index); + info += " MaxSingleAllocationTest returns with error "; + info += e.what(); + info += "."; + } - try { - amd::rdc::MemoryAccessTest test(gpu_index); - test.CPUAccessToGPUMemoryTest(); - test.GPUAccessToCPUMemoryTest(); - info += test.get_gpu_info(); - per_gpu_info += test.get_per_gpu_info(); - } catch (const amd::rdc::SkipException& e) { - result->status = RDC_DIAG_RESULT_SKIP; - per_gpu_info += "Memory Access is skipped: "; - per_gpu_info += e.what(); - info += "GPU "; - info += std::to_string(gpu_index); - info += " Memory Access is skipped: "; - info += e.what(); - info += "."; - } catch (const std::exception& e) { - result->status = RDC_DIAG_RESULT_FAIL; - per_gpu_info += "Memory Access returns with error "; - per_gpu_info += e.what(); - info += "GPU "; - info += std::to_string(gpu_index); - info += " Memory Access returns with error "; - info += e.what(); - info += "."; - } + try { + amd::rdc::MemoryAccessTest test(gpu_index); + test.CPUAccessToGPUMemoryTest(); + test.GPUAccessToCPUMemoryTest(); + info += test.get_gpu_info(); + per_gpu_info += test.get_per_gpu_info(); + } catch (const amd::rdc::SkipException& e) { + result->status = RDC_DIAG_RESULT_SKIP; + per_gpu_info += "Memory Access is skipped: "; + per_gpu_info += e.what(); + info += "GPU "; + info += std::to_string(gpu_index); + info += " Memory Access is skipped: "; + info += e.what(); + info += "."; + } catch (const std::exception& e) { + result->status = RDC_DIAG_RESULT_FAIL; + per_gpu_info += "Memory Access returns with error "; + per_gpu_info += e.what(); + info += "GPU "; + info += std::to_string(gpu_index); + info += " Memory Access returns with error "; + info 
+= e.what(); + info += "."; + } - strncpy_with_null(result->info, info.c_str(), - MAX_DIAG_MSG_LENGTH); - strncpy_with_null(result->gpu_results[gpu_index].gpu_result.msg, - per_gpu_info.c_str(), MAX_DIAG_MSG_LENGTH); + strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH); + strncpy_with_null(result->gpu_results[gpu_index].gpu_result.msg, per_gpu_info.c_str(), + MAX_DIAG_MSG_LENGTH); - return RDC_ST_OK; + return RDC_ST_OK; } +static rdc_status_t run_compute_queue_test(uint32_t gpu_index, rdc_diag_test_result_t* result) { + std::string info = result->info; + std::string per_gpu_info = result->gpu_results[gpu_index].gpu_result.msg; -static rdc_status_t run_compute_queue_test(uint32_t gpu_index, - rdc_diag_test_result_t* result) { - std::string info = result->info; - std::string per_gpu_info = result->gpu_results[gpu_index].gpu_result.msg; + try { + amd::rdc::ComputeQueueTest test(gpu_index); + test.RunBinarySearchTest(); + info += test.get_gpu_info(); + per_gpu_info += test.get_per_gpu_info(); + } catch (const amd::rdc::SkipException& e) { + result->status = RDC_DIAG_RESULT_SKIP; + per_gpu_info += "Compute Queue test is skipped: "; + per_gpu_info += e.what(); + info += "GPU "; + info += std::to_string(gpu_index); + info += " Compute Queue test is skipped: "; + info += e.what(); + info += "."; + } catch (const std::exception& e) { + result->status = RDC_DIAG_RESULT_FAIL; + per_gpu_info += "Compute Queue test returns with error "; + per_gpu_info += e.what(); + info += "GPU "; + info += std::to_string(gpu_index); + info += " Compute Queue test returns with error "; + info += e.what(); + info += "."; + } - try { - amd::rdc::ComputeQueueTest test(gpu_index); - test.RunBinarySearchTest(); - info += test.get_gpu_info(); - per_gpu_info += test.get_per_gpu_info(); - } catch (const amd::rdc::SkipException& e) { + strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH); + strncpy_with_null(result->gpu_results[gpu_index].gpu_result.msg, 
per_gpu_info.c_str(), + MAX_DIAG_MSG_LENGTH); + + return RDC_ST_OK; +} + +rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result) { + if (result == nullptr || gpu_count == 0) { + return RDC_ST_BAD_PARAMETER; + } + + if (test_case != RDC_DIAG_COMPUTE_QUEUE && test_case != RDC_DIAG_SYS_MEM_CHECK) { + return RDC_ST_OK; + } + + // init the return data + *result = {}; + result->test_case = test_case; + result->status = RDC_DIAG_RESULT_PASS; + result->per_gpu_result_count = 0; + + // Run test for each GPU. It will continue even + // if one GPU test is fail. + for (uint32_t i = 0; i < gpu_count; i++) { + switch (test_case) { + case RDC_DIAG_SYS_MEM_CHECK: + run_memory_test(gpu_index[i], result); + break; + case RDC_DIAG_COMPUTE_QUEUE: + run_compute_queue_test(gpu_index[i], result); + break; + default: result->status = RDC_DIAG_RESULT_SKIP; - per_gpu_info += "Compute Queue test is skipped: "; - per_gpu_info += e.what(); - info += "GPU "; - info += std::to_string(gpu_index); - info += " Compute Queue test is skipped: "; - info += e.what(); - info += "."; - } catch (const std::exception& e) { - result->status = RDC_DIAG_RESULT_FAIL; - per_gpu_info += "Compute Queue test returns with error "; - per_gpu_info += e.what(); - info += "GPU "; - info += std::to_string(gpu_index); - info += " Compute Queue test returns with error "; - info += e.what(); - info += "."; + strncpy_with_null(result->info, "Not support yet", MAX_DIAG_MSG_LENGTH); } + } - strncpy_with_null(result->info, info.c_str(), - MAX_DIAG_MSG_LENGTH); - strncpy_with_null(result->gpu_results[gpu_index].gpu_result.msg, - per_gpu_info.c_str(), MAX_DIAG_MSG_LENGTH); - - return RDC_ST_OK; -} - -rdc_status_t rdc_diag_test_case_run( - rdc_diag_test_cases_t test_case, - uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { - if (result == nullptr || - gpu_count == 0 ) { - 
return RDC_ST_BAD_PARAMETER; - } - - if (test_case != RDC_DIAG_COMPUTE_QUEUE && - test_case != RDC_DIAG_SYS_MEM_CHECK) { - return RDC_ST_OK; - } - - // init the return data - *result = {}; - result->test_case = test_case; - result->status = RDC_DIAG_RESULT_PASS; - result->per_gpu_result_count = 0; - - // Run test for each GPU. It will continue even - // if one GPU test is fail. - for (uint32_t i = 0; i < gpu_count; i++) { - switch (test_case) { - case RDC_DIAG_SYS_MEM_CHECK: - run_memory_test(gpu_index[i], result); - break; - case RDC_DIAG_COMPUTE_QUEUE: - run_compute_queue_test(gpu_index[i], result); - break; - default: - result->status = RDC_DIAG_RESULT_SKIP; - strncpy_with_null(result->info, "Not support yet" - , MAX_DIAG_MSG_LENGTH); - } - } - - return RDC_ST_OK; + return RDC_ST_OK; } diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcRocrBase.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcRocrBase.cc index 29dcc8b792..b6d3082d88 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcRocrBase.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcRocrBase.cc @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include "rdc_modules/rdc_rocr/RdcRocrBase.h" + #include namespace amd { @@ -45,8 +46,7 @@ RdcRocrBase::RdcRocrBase(void) { orig_hsa_enable_interrupt_ = nullptr; } -RdcRocrBase::~RdcRocrBase() { -} +RdcRocrBase::~RdcRocrBase() {} } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/TestBase.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/TestBase.cc old mode 100755 new mode 100644 index 6d4035e2c8..6f15642947 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/TestBase.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/TestBase.cc @@ -20,13 +20,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "rdc_modules/rdc_rocr/TestBase.h" + #include #include + #include -#include "rdc_modules/rdc_rocr/TestBase.h" -#include "rdc_modules/rdc_rocr/base_rocr_utils.h" + #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" +#include "rdc_modules/rdc_rocr/base_rocr_utils.h" namespace amd { namespace rdc { @@ -40,16 +43,10 @@ static const char kRunLabel[] = "TEST EXECUTION"; static const char kCloseLabel[] = "TEST CLEAN UP"; static const char kResultsLabel[] = "TEST RESULTS"; +TestBase::TestBase(uint32_t gpu_index) : gpu_index_(gpu_index), description_("") { SetUp(); } +TestBase::~TestBase() { Close(); } -TestBase::TestBase(uint32_t gpu_index): - gpu_index_(gpu_index), description_("") { - SetUp(); -} -TestBase::~TestBase() { - Close(); -} - -static void MakeHeaderStr(const char *inStr, std::string *outStr) { +static void MakeHeaderStr(const char* inStr, std::string* outStr) { assert(outStr != nullptr); assert(inStr != nullptr); @@ -88,7 +85,6 @@ void TestBase::Close(void) { throw_if_error(err); } - void TestBase::DisplayResults(void) const { std::string label; MakeHeaderStr(kResultsLabel, &label); @@ -96,8 +92,9 @@ void TestBase::DisplayResults(void) const { } void TestBase::DisplayTestInfo(void) { - printf("#########################################" - "######################################\n"); + printf( + "#########################################" + "######################################\n"); std::string label; MakeHeaderStr(kTitleLabel, &label); @@ -122,8 +119,7 @@ void TestBase::set_description(std::string d) { } } -hsa_status_t TestBase::get_agent_by_gpu_index(uint32_t gpu_index, - hsa_agent_t* agent) { +hsa_status_t TestBase::get_agent_by_gpu_index(uint32_t gpu_index, hsa_agent_t* agent) { hsa_status_t err = HSA_STATUS_SUCCESS; std::vector gpus; err = hsa_iterate_agents(IterateGPUAgents, &gpus); @@ -135,19 +131,16 @@ hsa_status_t TestBase::get_agent_by_gpu_index(uint32_t gpu_index, // sort based on bdf id std::vector> dv_to_id; for (uint32_t 
dv_ind = 0; dv_ind < gpus.size(); ++dv_ind) { - auto dev = gpus[dv_ind]; - uint16_t bdf_id = 0; - err = hsa_agent_get_info(dev, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id); - throw_if_error(err, "fail to get gpu bdfid"); - dv_to_id.push_back({bdf_id, dev}); + auto dev = gpus[dv_ind]; + uint16_t bdf_id = 0; + err = hsa_agent_get_info(dev, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id); + throw_if_error(err, "fail to get gpu bdfid"); + dv_to_id.push_back({bdf_id, dev}); } // Stable sort to keep the order if bdf is equal. - std::stable_sort(dv_to_id.begin(), dv_to_id.end(), [] - (const std::pair& p1, - const std::pair& p2) { - return p1.first < p2.first; - }); + std::stable_sort(dv_to_id.begin(), dv_to_id.end(), + [](const std::pair& p1, + const std::pair& p2) { return p1.first < p2.first; }); *agent = dv_to_id[gpu_index].second; @@ -156,4 +149,3 @@ hsa_status_t TestBase::get_agent_by_gpu_index(uint32_t gpu_index, } // namespace rdc } // namespace amd - diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/base_rocr_utils.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/base_rocr_utils.cc old mode 100755 new mode 100644 index d2e0b70d3c..b5e53c1188 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/base_rocr_utils.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/base_rocr_utils.cc @@ -20,16 +20,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - #include "rdc_modules/rdc_rocr/base_rocr_utils.h" + #include #include +#include #include #include -#include #include -#include + #include +#include + #include "hsa/hsa.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" @@ -37,7 +39,6 @@ THE SOFTWARE. namespace amd { namespace rdc { - // Clean up some of the common handles and memory used by RdcRocrBase code, then // shut down hsa. 
Restore HSA_ENABLE_INTERRUPT to original value, if necessary hsa_status_t CommonCleanUp(RdcRocrBase* test) { @@ -78,7 +79,10 @@ hsa_status_t CommonCleanUp(RdcRocrBase* test) { return err; } -static const char* PROFILE_STR[] = {"HSA_PROFILE_BASE", "HSA_PROFILE_FULL", }; +static const char* PROFILE_STR[] = { + "HSA_PROFILE_BASE", + "HSA_PROFILE_FULL", +}; /// Verify that the machine running the test has the required profile. /// This function will verify that the execution machine meets any specific @@ -89,18 +93,16 @@ static const char* PROFILE_STR[] = {"HSA_PROFILE_BASE", "HSA_PROFILE_FULL", }; /// - false Machine does not meet test requirements bool CheckProfileAndInform(RdcRocrBase* test) { if (test->verbosity() > 0) { - RDC_LOG(RDC_DEBUG, "Target HW Profile is " - << PROFILE_STR[test->profile()]); + RDC_LOG(RDC_DEBUG, "Target HW Profile is " << PROFILE_STR[test->profile()]); } if (test->requires_profile() == -1) { if (test->verbosity() > 0) { - RDC_LOG(RDC_DEBUG, "Test can run on any profile. OK."); + RDC_LOG(RDC_DEBUG, "Test can run on any profile. OK."); } return true; } else { - RDC_LOG(RDC_DEBUG, "Test requires " << PROFILE_STR[test->requires_profile()] - << ". "); + RDC_LOG(RDC_DEBUG, "Test requires " << PROFILE_STR[test->requires_profile()] << ". 
"); if (test->requires_profile() != test->profile()) { RDC_LOG(RDC_DEBUG, "Not Running."); return false; @@ -133,29 +135,29 @@ static hsa_status_t ProcessIterateError(hsa_status_t err) { hsa_status_t SetPoolsTypical(RdcRocrBase* test) { hsa_status_t err; if (test->profile() == HSA_PROFILE_FULL) { - err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), - FindAPUStandardPool, &test->cpu_pool()); + err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindAPUStandardPool, + &test->cpu_pool()); throw_if_error(ProcessIterateError(err)); - err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), - FindAPUStandardPool, &test->device_pool()); + err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindAPUStandardPool, + &test->device_pool()); throw_if_error(ProcessIterateError(err)); - err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), - FindAPUStandardPool, &test->kern_arg_pool()); + err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindAPUStandardPool, + &test->kern_arg_pool()); throw_if_error(ProcessIterateError(err)); } else { - err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), - FindStandardPool, &test->cpu_pool()); + err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindStandardPool, + &test->cpu_pool()); throw_if_error(ProcessIterateError(err)); - err = hsa_amd_agent_iterate_memory_pools(*test->gpu_device1(), - FindStandardPool, &test->device_pool()); + err = hsa_amd_agent_iterate_memory_pools(*test->gpu_device1(), FindStandardPool, + &test->device_pool()); throw_if_error(ProcessIterateError(err)); - err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), - FindKernArgPool, &test->kern_arg_pool()); + err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindKernArgPool, + &test->kern_arg_pool()); throw_if_error(ProcessIterateError(err)); } @@ -261,11 +263,11 @@ hsa_status_t LoadKernelFromObjFile(RdcRocrBase* test, hsa_agent_t* agent) { } std::string kern_name = 
test->kernel_name(); - std::string obj_file = search_hsaco_full_path( - test->kernel_file_name().c_str(), test->get_agent_name().c_str()); + std::string obj_file = + search_hsaco_full_path(test->kernel_file_name().c_str(), test->get_agent_name().c_str()); if (obj_file == "") { - RDC_LOG(RDC_ERROR, "failed to find " << test->kernel_file_name() << - " at line " << __LINE__ << ", errno: " << errno); + RDC_LOG(RDC_ERROR, "failed to find " << test->kernel_file_name() << " at line " << __LINE__ + << ", errno: " << errno); std::string msg("fail to open "); msg += test->kernel_file_name(); throw_if_skip(msg); @@ -275,55 +277,53 @@ hsa_status_t LoadKernelFromObjFile(RdcRocrBase* test, hsa_agent_t* agent) { hsa_file_t file_handle = open(obj_file.c_str(), O_RDONLY); if (file_handle == -1) { - RDC_LOG(RDC_ERROR, "failed to open " << obj_file.c_str() << " at line " - << __LINE__ << ", file: " << __FILE__); - return (hsa_status_t) errno; + RDC_LOG(RDC_ERROR, "failed to open " << obj_file.c_str() << " at line " << __LINE__ + << ", file: " << __FILE__); + return (hsa_status_t)errno; } err = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); throw_if_error(err); close(file_handle); - err = hsa_executable_create_alt(HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - NULL, &executable); + err = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, + &executable); throw_if_error(err); - err = hsa_executable_load_agent_code_object(executable, *agent, code_obj_rdr, - NULL, NULL); + err = hsa_executable_load_agent_code_object(executable, *agent, code_obj_rdr, NULL, NULL); throw_if_error(err); err = hsa_executable_freeze(executable, NULL); throw_if_error(err); hsa_executable_symbol_t kern_sym; - err = hsa_executable_get_symbol(executable, NULL, (kern_name + ".kd").c_str(), *agent, - 0, &kern_sym); + err = hsa_executable_get_symbol(executable, NULL, (kern_name + ".kd").c_str(), *agent, 0, + &kern_sym); throw_if_error(err); 
uint64_t codeHandle; - err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &codeHandle); + err = hsa_executable_symbol_get_info(kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &codeHandle); throw_if_error(err); test->set_kernel_object(codeHandle); uint32_t val; - err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &val); + err = hsa_executable_symbol_get_info( + kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &val); throw_if_error(err); test->set_private_segment_size(val); err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &val); + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &val); throw_if_error(err); test->set_group_segment_size(val); // Remaining queries only supported on code object v3. - err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &val); + err = hsa_executable_symbol_get_info( + kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &val); throw_if_error(err); test->set_kernarg_size(val); - err = hsa_executable_symbol_get_info(kern_sym, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, &val); + err = hsa_executable_symbol_get_info( + kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, &val); throw_if_error(err); assert(val >= 16 && "Reported kernarg size is too small."); val = (val == 0) ? 
16 : val; @@ -332,26 +332,23 @@ hsa_status_t LoadKernelFromObjFile(RdcRocrBase* test, hsa_agent_t* agent) { return HSA_STATUS_SUCCESS; } -hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue, - uint32_t num_pkts) { +hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue, uint32_t num_pkts) { hsa_status_t err; if (num_pkts == 0) { - err = hsa_agent_get_info(device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, - &num_pkts); + err = hsa_agent_get_info(device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &num_pkts); throw_if_error(err); } - err = hsa_queue_create(device, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, - NULL, UINT32_MAX, UINT32_MAX, queue); + err = hsa_queue_create(device, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, + queue); throw_if_error(err); return HSA_STATUS_SUCCESS; } // Initialize the provided aql packet with standard default values, and // values from provided RdcRocrBase object. -hsa_status_t InitializeAQLPacket(const RdcRocrBase* test, - hsa_kernel_dispatch_packet_t* aql) { +hsa_status_t InitializeAQLPacket(const RdcRocrBase* test, hsa_kernel_dispatch_packet_t* aql) { hsa_status_t err; assert(aql != nullptr); @@ -359,7 +356,7 @@ hsa_status_t InitializeAQLPacket(const RdcRocrBase* test, if (aql == nullptr) { return HSA_STATUS_ERROR; } - + // Initialize Packet type as Invalid // Update packet type to Kernel Dispatch // right before ringing doorbell @@ -370,7 +367,7 @@ hsa_status_t InitializeAQLPacket(const RdcRocrBase* test, aql->workgroup_size_y = 1; aql->workgroup_size_z = 1; - aql->grid_size_x = (uint64_t) 256; // manual_input*group_input; workg max sz + aql->grid_size_x = (uint64_t)256; // manual_input*group_input; workg max sz aql->grid_size_y = 1; aql->grid_size_z = 1; @@ -392,11 +389,11 @@ hsa_status_t InitializeAQLPacket(const RdcRocrBase* test, // Copy RdcRocrBase aql object values to the RdcRocrBase object queue in the // specified queue position (ind) -hsa_kernel_dispatch_packet_t * WriteAQLToQueue(RdcRocrBase* test, 
uint64_t *ind) { +hsa_kernel_dispatch_packet_t* WriteAQLToQueue(RdcRocrBase* test, uint64_t* ind) { assert(test); assert(test->main_queue()); - void *queue_base = test->main_queue()->base_address; + void* queue_base = test->main_queue()->base_address; const uint32_t queue_mask = test->main_queue()->size - 1; uint64_t que_idx = hsa_queue_add_write_index_relaxed(test->main_queue(), 1); *ind = que_idx; @@ -405,8 +402,7 @@ hsa_kernel_dispatch_packet_t * WriteAQLToQueue(RdcRocrBase* test, uint64_t *ind) hsa_kernel_dispatch_packet_t* queue_aql_packet; queue_aql_packet = - &(reinterpret_cast(queue_base)) - [que_idx & queue_mask]; + &(reinterpret_cast(queue_base))[que_idx & queue_mask]; queue_aql_packet->workgroup_size_x = staging_aql_packet->workgroup_size_x; queue_aql_packet->workgroup_size_y = staging_aql_packet->workgroup_size_y; @@ -414,10 +410,8 @@ hsa_kernel_dispatch_packet_t * WriteAQLToQueue(RdcRocrBase* test, uint64_t *ind) queue_aql_packet->grid_size_x = staging_aql_packet->grid_size_x; queue_aql_packet->grid_size_y = staging_aql_packet->grid_size_y; queue_aql_packet->grid_size_z = staging_aql_packet->grid_size_z; - queue_aql_packet->private_segment_size = - staging_aql_packet->private_segment_size; - queue_aql_packet->group_segment_size = - staging_aql_packet->group_segment_size; + queue_aql_packet->private_segment_size = staging_aql_packet->private_segment_size; + queue_aql_packet->group_segment_size = staging_aql_packet->group_segment_size; queue_aql_packet->kernel_object = staging_aql_packet->kernel_object; queue_aql_packet->kernarg_address = staging_aql_packet->kernarg_address; queue_aql_packet->completion_signal = staging_aql_packet->completion_signal; @@ -425,19 +419,16 @@ hsa_kernel_dispatch_packet_t * WriteAQLToQueue(RdcRocrBase* test, uint64_t *ind) return queue_aql_packet; } -void -WriteAQLToQueueLoc(hsa_queue_t *queue, uint64_t indx, - hsa_kernel_dispatch_packet_t *aql_pkt) { +void WriteAQLToQueueLoc(hsa_queue_t* queue, uint64_t indx, 
hsa_kernel_dispatch_packet_t* aql_pkt) { assert(queue); assert(aql_pkt); - void *queue_base = queue->base_address; + void* queue_base = queue->base_address; const uint32_t queue_mask = queue->size - 1; hsa_kernel_dispatch_packet_t* queue_aql_packet; queue_aql_packet = - &(reinterpret_cast(queue_base)) - [indx & queue_mask]; + &(reinterpret_cast(queue_base))[indx & queue_mask]; queue_aql_packet->workgroup_size_x = aql_pkt->workgroup_size_x; queue_aql_packet->workgroup_size_y = aql_pkt->workgroup_size_y; @@ -445,10 +436,8 @@ WriteAQLToQueueLoc(hsa_queue_t *queue, uint64_t indx, queue_aql_packet->grid_size_x = aql_pkt->grid_size_x; queue_aql_packet->grid_size_y = aql_pkt->grid_size_y; queue_aql_packet->grid_size_z = aql_pkt->grid_size_z; - queue_aql_packet->private_segment_size = - aql_pkt->private_segment_size; - queue_aql_packet->group_segment_size = - aql_pkt->group_segment_size; + queue_aql_packet->private_segment_size = aql_pkt->private_segment_size; + queue_aql_packet->group_segment_size = aql_pkt->group_segment_size; queue_aql_packet->kernel_object = aql_pkt->kernel_object; queue_aql_packet->kernarg_address = aql_pkt->kernarg_address; queue_aql_packet->completion_signal = aql_pkt->completion_signal; @@ -474,11 +463,10 @@ hsa_status_t AllocAndSetKernArgs(RdcRocrBase* test, void* args, size_t arg_size) test->set_kernarg_buffer(kern_arg_buf); - void *adj_kern_arg_buf = AlignUp(kern_arg_buf, req_align); + void* adj_kern_arg_buf = AlignUp(kern_arg_buf, req_align); assert(arg_size >= test->kernarg_size()); - assert(((uintptr_t)adj_kern_arg_buf + arg_size) < - ((uintptr_t)kern_arg_buf + buf_size)); + assert(((uintptr_t)adj_kern_arg_buf + arg_size) < ((uintptr_t)kern_arg_buf + buf_size)); hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()}; err = hsa_amd_agents_allow_access(2, ag_list, NULL, kern_arg_buf); @@ -494,28 +482,27 @@ hsa_status_t AllocAndSetKernArgs(RdcRocrBase* test, void* args, size_t arg_size) std::string get_lib_dir(const char* lib_name) 
{ std::string result; - char line[1024*8]; + char line[1024 * 8]; FILE* file = fopen("/proc/self/maps", "r"); - if (file == NULL) - return result; + if (file == NULL) return result; std::string lib_path = "/"; lib_path += lib_name; // 7f4eacb46000 r-xp 00000 08:01 17183106 /lib/x86_64-linux-gnu/libc-2.27.so while (fgets(line, sizeof(line), file)) { - char* end = strstr(line, lib_path.c_str()); - if (end != NULL) { - char* start = end; - while (start > line) { - if (isspace(*start)) { - start++; - break; - } - start--; - } - result = std::string(start, end-start); - break; + char* end = strstr(line, lib_path.c_str()); + if (end != NULL) { + char* start = end; + while (start > line) { + if (isspace(*start)) { + start++; + break; + } + start--; } + result = std::string(start, end - start); + break; + } } fclose(file); @@ -523,41 +510,37 @@ std::string get_lib_dir(const char* lib_name) { } std::string get_app_dir() { - char buf[1024*8]; - int ret = readlink("/proc/self/exe", buf, 1024*8); - if ((ret != -1) && ret < (1024*8 - 1)) { + char buf[1024 * 8]; + int ret = readlink("/proc/self/exe", buf, 1024 * 8); + if ((ret != -1) && ret < (1024 * 8 - 1)) { buf[ret] = '\0'; return dirname(buf); } return ""; } -std::string search_hsaco_full_path(const char* hsaco_file_name, - const char* agent_name) { +std::string search_hsaco_full_path(const char* hsaco_file_name, const char* agent_name) { const std::string lib_dir = get_lib_dir("librdc_rocr.so"); const std::string app_dir = get_app_dir(); std::vector path_to_search; - path_to_search.push_back(std::string("./")+hsaco_file_name); - path_to_search.push_back(app_dir+"/"+hsaco_file_name); - path_to_search.push_back(lib_dir+"/"+hsaco_file_name); - path_to_search.push_back(lib_dir+"/rdc/hsaco/"+ agent_name - + "/" + hsaco_file_name); - path_to_search.push_back(lib_dir+"/hsaco/"+ agent_name - + "/" + hsaco_file_name); + path_to_search.push_back(std::string("./") + hsaco_file_name); + path_to_search.push_back(app_dir + "/" + 
hsaco_file_name); + path_to_search.push_back(lib_dir + "/" + hsaco_file_name); + path_to_search.push_back(lib_dir + "/rdc/hsaco/" + agent_name + "/" + hsaco_file_name); + path_to_search.push_back(lib_dir + "/hsaco/" + agent_name + "/" + hsaco_file_name); // for dev structure - path_to_search.push_back(lib_dir+"/../../rdc_libs/rdc_modules/kernels/hsaco/" - + agent_name + "/" + hsaco_file_name); + path_to_search.push_back(lib_dir + "/../../rdc_libs/rdc_modules/kernels/hsaco/" + agent_name + + "/" + hsaco_file_name); for (std::size_t i = 0; i < path_to_search.size(); i++) { - if ( ::access(path_to_search[i].c_str(), F_OK) == 0 ) { - RDC_LOG(RDC_DEBUG, "Use the file " << path_to_search[i]); - return path_to_search[i]; - } - RDC_LOG(RDC_DEBUG, "Skip not exists file " << path_to_search[i]); + if (::access(path_to_search[i].c_str(), F_OK) == 0) { + RDC_LOG(RDC_DEBUG, "Use the file " << path_to_search[i]); + return path_to_search[i]; + } + RDC_LOG(RDC_DEBUG, "Skip not exists file " << path_to_search[i]); } return ""; } - } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/common.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/common.cc old mode 100755 new mode 100644 index 634702a8ae..9586353735 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/common.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/common.cc @@ -23,11 +23,14 @@ THE SOFTWARE. 
/// \file /// Implementation of utility functions used by RocR applications #include "rdc_modules/rdc_rocr/common.h" + #include #include + +#include #include #include -#include + #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" @@ -38,14 +41,11 @@ void throw_if_error(hsa_status_t err, const std::string& msg) { if (err != HSA_STATUS_SUCCESS) { const char* errstr = 0; hsa_status_string(err, &errstr); - throw std::runtime_error(msg + " hsa error code: " - + std::to_string(err) + " " + errstr); + throw std::runtime_error(msg + " hsa error code: " + std::to_string(err) + " " + errstr); } } -void throw_if_skip(const std::string& msg) { - throw SkipException(msg.c_str()); -} +void throw_if_skip(const std::string& msg) { throw SkipException(msg.c_str()); } void SetEnv(const char* env_var_name, const char* env_var_value) { int err = setenv(env_var_name, env_var_value, 1); @@ -56,28 +56,21 @@ void SetEnv(const char* env_var_name, const char* env_var_value) { } } -intptr_t -AlignDown(intptr_t value, size_t alignment) { - assert(alignment != 0 && "Zero alignment"); - return (intptr_t) (value & ~(alignment - 1)); +intptr_t AlignDown(intptr_t value, size_t alignment) { + assert(alignment != 0 && "Zero alignment"); + return (intptr_t)(value & ~(alignment - 1)); } -void * -AlignDown(void* value, size_t alignment) { - return reinterpret_cast(AlignDown( - reinterpret_cast(value), alignment)); +void* AlignDown(void* value, size_t alignment) { + return reinterpret_cast(AlignDown(reinterpret_cast(value), alignment)); } -void * -AlignUp(void* value, size_t alignment) { - return reinterpret_cast( - AlignDown((uintptr_t)(reinterpret_cast(value) + alignment - 1), - alignment)); +void* AlignUp(void* value, size_t alignment) { + return reinterpret_cast( + AlignDown((uintptr_t)(reinterpret_cast(value) + alignment - 1), alignment)); } - -static hsa_status_t FindAgent(hsa_agent_t agent, void* data, - hsa_device_type_t dev_type) { +static hsa_status_t FindAgent(hsa_agent_t agent, 
void* data, hsa_device_type_t dev_type) { assert(data != nullptr); if (data == nullptr) { @@ -85,8 +78,7 @@ static hsa_status_t FindAgent(hsa_agent_t agent, void* data, } hsa_device_type_t hsa_device_type; - hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, - &hsa_device_type); + hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); throw_if_error(hsa_error_code); if (hsa_device_type == dev_type) { @@ -98,7 +90,7 @@ static hsa_status_t FindAgent(hsa_agent_t agent, void* data, } // Find CPU Agents -hsa_status_t IterateCPUAgents(hsa_agent_t agent, void *data) { +hsa_status_t IterateCPUAgents(hsa_agent_t agent, void* data) { hsa_status_t status; assert(data != nullptr); if (data == nullptr) { @@ -115,10 +107,8 @@ hsa_status_t IterateCPUAgents(hsa_agent_t agent, void *data) { return status; } - - // Find GPU Agents -hsa_status_t IterateGPUAgents(hsa_agent_t agent, void *data) { +hsa_status_t IterateGPUAgents(hsa_agent_t agent, void* data) { hsa_status_t status; assert(data != nullptr); if (data == nullptr) { @@ -138,27 +128,20 @@ hsa_status_t IterateGPUAgents(hsa_agent_t agent, void *data) { hsa_status_t GetGlobalMemoryPool(hsa_amd_memory_pool_t pool, void* data) { hsa_amd_segment_t segment; hsa_status_t err; - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_SEGMENT, - &segment); - if (HSA_AMD_SEGMENT_GLOBAL != segment) - return err; + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + if (HSA_AMD_SEGMENT_GLOBAL != segment) return err; hsa_amd_memory_pool_global_flag_t flags; - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, - &flags); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags); throw_if_error(err); // this is valid for dGPUs. 
But on APUs, it has to be FINE_GRAINED if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { - hsa_amd_memory_pool_t* ret = - reinterpret_cast(data); + hsa_amd_memory_pool_t* ret = reinterpret_cast(data); *ret = pool; } else { // this is for APUs if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { - hsa_amd_memory_pool_t* ret = - reinterpret_cast(data); + hsa_amd_memory_pool_t* ret = reinterpret_cast(data); *ret = pool; } } @@ -172,23 +155,18 @@ hsa_status_t GetKernArgMemoryPool(hsa_amd_memory_pool_t pool, void* data) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } hsa_amd_segment_t segment; - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_SEGMENT, - &segment); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); throw_if_error(err); if (HSA_AMD_SEGMENT_GLOBAL != segment) { return HSA_STATUS_SUCCESS; } hsa_amd_memory_pool_global_flag_t flags; - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, - &flags); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags); throw_if_error(err); if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { - hsa_amd_memory_pool_t* ret = - reinterpret_cast(data); + hsa_amd_memory_pool_t* ret = reinterpret_cast(data); *ret = pool; } @@ -211,10 +189,9 @@ typedef enum { POOL_PROP_DONT_CARE ///< We don't care if the property is present or not. 
} pool_prop_t; -static hsa_status_t -FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment, - pool_prop_t accessible_by_all, pool_prop_t kern_arg, - pool_prop_t fine_grain) { +static hsa_status_t FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment, + pool_prop_t accessible_by_all, pool_prop_t kern_arg, + pool_prop_t fine_grain) { if (nullptr == data) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -223,8 +200,7 @@ FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment, hsa_amd_segment_t segment; uint32_t flag; - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, - &segment); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); throw_if_error(err); if (in_segment != segment) { @@ -232,8 +208,7 @@ FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment, } if (HSA_AMD_SEGMENT_GLOBAL == in_segment) { - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); throw_if_error(err); if (kern_arg != POOL_PROP_DONT_CARE) { @@ -254,13 +229,12 @@ FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment, if (accessible_by_all != POOL_PROP_DONT_CARE) { bool access_read; - err = hsa_amd_memory_pool_get_info(pool, - (hsa_amd_memory_pool_info_t) - HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &access_read); + err = hsa_amd_memory_pool_get_info( + pool, (hsa_amd_memory_pool_info_t)HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &access_read); throw_if_error(err); if (((!access_read) && accessible_by_all == POOL_PROP_ON) || - (access_read && (accessible_by_all == POOL_PROP_OFF))) { + (access_read && (accessible_by_all == POOL_PROP_OFF))) { return HSA_STATUS_SUCCESS; } } @@ -270,69 +244,64 @@ FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment, } hsa_status_t 
FindStandardPool(hsa_amd_memory_pool_t pool, void* data) { - return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE, - POOL_PROP_OFF, POOL_PROP_DONT_CARE); + return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE, POOL_PROP_OFF, + POOL_PROP_DONT_CARE); } hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { - return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE, - POOL_PROP_ON, POOL_PROP_DONT_CARE); + return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE, POOL_PROP_ON, + POOL_PROP_DONT_CARE); } hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data) { - return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_ON, - POOL_PROP_OFF, POOL_PROP_DONT_CARE); + return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_ON, POOL_PROP_OFF, + POOL_PROP_DONT_CARE); } hsa_status_t FindAPUStandardPool(hsa_amd_memory_pool_t pool, void* data) { - return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE, - POOL_PROP_DONT_CARE, POOL_PROP_DONT_CARE); + return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE, POOL_PROP_DONT_CARE, + POOL_PROP_DONT_CARE); } // Populate the vector with handles to all agents and pools -hsa_status_t -GetAgentPools(std::vector> *agent_pools) { +hsa_status_t GetAgentPools(std::vector>* agent_pools) { hsa_status_t err; assert(agent_pools != nullptr); - auto save_agent = [](hsa_agent_t a, void *data)->hsa_status_t { - std::vector> *ag_vec; + auto save_agent = [](hsa_agent_t a, void* data) -> hsa_status_t { + std::vector>* ag_vec; hsa_status_t err; assert(data != nullptr); - ag_vec = - reinterpret_cast> *>(data); + ag_vec = reinterpret_cast>*>(data); std::shared_ptr ag(new agent_pools_t); ag->agent = a; - - auto save_pool = [](hsa_amd_memory_pool_t p, void *data)->hsa_status_t { + auto save_pool = [](hsa_amd_memory_pool_t p, void* data) -> hsa_status_t { assert(data != nullptr); - std::vector *p_list = - reinterpret_cast 
*>(data); + std::vector* p_list = + reinterpret_cast*>(data); p_list->push_back(p); return HSA_STATUS_SUCCESS; }; - err = hsa_amd_agent_iterate_memory_pools(a, save_pool, - reinterpret_cast(&ag->pools)); + err = hsa_amd_agent_iterate_memory_pools(a, save_pool, reinterpret_cast(&ag->pools)); ag_vec->push_back(ag); return err; }; - err = hsa_iterate_agents(save_agent, reinterpret_cast(agent_pools)); + err = hsa_iterate_agents(save_agent, reinterpret_cast(agent_pools)); return err; } -static hsa_status_t MakeGlobalFlagsString(const pool_info_t *pool_i, - std::string* out_str) { +static hsa_status_t MakeGlobalFlagsString(const pool_info_t* pool_i, std::string* out_str) { uint32_t global_flag = pool_i->global_flag; assert(out_str != nullptr); *out_str = ""; - std::vector < std::string > flags; + std::vector flags; if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) { flags.push_back("KERNARG"); @@ -356,8 +325,7 @@ static hsa_status_t MakeGlobalFlagsString(const pool_info_t *pool_i, return HSA_STATUS_SUCCESS; } -static hsa_status_t DumpSegment(const pool_info_t *pool_i, - std::string const *ind_lvl) { +static hsa_status_t DumpSegment(const pool_info_t* pool_i, std::string const* ind_lvl) { hsa_status_t err; RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Segment:"); @@ -394,53 +362,44 @@ static hsa_status_t DumpSegment(const pool_info_t *pool_i, return HSA_STATUS_SUCCESS; } -hsa_status_t AcquirePoolInfo(hsa_amd_memory_pool_t pool, - pool_info_t *pool_i) { +hsa_status_t AcquirePoolInfo(hsa_amd_memory_pool_t pool, pool_info_t* pool_i) { hsa_status_t err; - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &pool_i->global_flag); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, + &pool_i->global_flag); throw_if_error(err); - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, - &pool_i->segment); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, 
&pool_i->segment); throw_if_error(err); // Get the size of the POOL - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, - &pool_i->size); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &pool_i->size); throw_if_error(err); - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, - &pool_i->alloc_allowed); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, + &pool_i->alloc_allowed); throw_if_error(err); - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, - &pool_i->alloc_granule); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, + &pool_i->alloc_granule); throw_if_error(err); - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, - &pool_i->alloc_alignment); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, + &pool_i->alloc_alignment); throw_if_error(err); - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, - &pool_i->accessible_by_all); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, + &pool_i->accessible_by_all); throw_if_error(err); - err = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE, - &pool_i->aggregate_alloc_max); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE, + &pool_i->aggregate_alloc_max); throw_if_error(err); return HSA_STATUS_SUCCESS; } -hsa_status_t DumpMemoryPoolInfo(const pool_info_t *pool_i, - uint32_t indent) { +hsa_status_t DumpMemoryPoolInfo(const pool_info_t* pool_i, uint32_t indent) { std::string ind_lvl(indent, ' '); DumpSegment(pool_i, &ind_lvl); @@ -448,31 +407,25 @@ hsa_status_t DumpMemoryPoolInfo(const pool_info_t *pool_i, std::string sz_str = std::to_string(pool_i->size / 1024) + "KB"; RDC_LOG(RDC_DEBUG, ind_lvl 
<< " Pool Size:" << sz_str); - RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Allocatable:" - << (pool_i->alloc_allowed ? "TRUE" : "FALSE")); + RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Allocatable:" << (pool_i->alloc_allowed ? "TRUE" : "FALSE")); std::string gr_str = std::to_string(pool_i->alloc_granule / 1024) + "KB"; RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Alloc Granule:" << gr_str); - std::string al_str = - std::to_string(pool_i->alloc_alignment / 1024) + "KB"; + std::string al_str = std::to_string(pool_i->alloc_alignment / 1024) + "KB"; RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Alloc Alignment:" << al_str); - RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Acessible by all:" << - (pool_i->accessible_by_all ? "TRUE" : "FALSE")); + RDC_LOG(RDC_DEBUG, + ind_lvl << " Pool Acessible by all:" << (pool_i->accessible_by_all ? "TRUE" : "FALSE")); - std::string agg_str = - std::to_string(pool_i->aggregate_alloc_max / 1024) + "KB"; + std::string agg_str = std::to_string(pool_i->aggregate_alloc_max / 1024) + "KB"; RDC_LOG(RDC_DEBUG, ind_lvl << "Pool Aggregate Alloc Size:" << agg_str); return HSA_STATUS_SUCCESS; } -static const char* Types[] = {"HSA_EXT_POINTER_TYPE_UNKNOWN", - "HSA_EXT_POINTER_TYPE_HSA", - "HSA_EXT_POINTER_TYPE_LOCKED", - "HSA_EXT_POINTER_TYPE_GRAPHICS", - "HSA_EXT_POINTER_TYPE_IPC" - }; +static const char* Types[] = {"HSA_EXT_POINTER_TYPE_UNKNOWN", "HSA_EXT_POINTER_TYPE_HSA", + "HSA_EXT_POINTER_TYPE_LOCKED", "HSA_EXT_POINTER_TYPE_GRAPHICS", + "HSA_EXT_POINTER_TYPE_IPC"}; hsa_status_t DumpPointerInfo(void* ptr) { hsa_amd_pointer_info_t info; @@ -484,14 +437,11 @@ hsa_status_t DumpPointerInfo(void* ptr) { throw_if_error(err); std::cout << "Info for ptr: " << ptr << std::endl; - std::cout << "CPU ptr: " << reinterpret_cast(info.hostBaseAddress) << - std::endl; - std::cout << "GPU ptr: " << reinterpret_cast(info.agentBaseAddress) - << std::endl; + std::cout << "CPU ptr: " << reinterpret_cast(info.hostBaseAddress) << std::endl; + std::cout << "GPU ptr: " << 
reinterpret_cast(info.agentBaseAddress) << std::endl; std::cout << "Size: " << info.sizeInBytes << std::endl; std::cout << "Type: " << Types[info.type] << std::endl; - std::cout << "UsrPtr " << reinterpret_cast(info.userData) << - std::endl; + std::cout << "UsrPtr " << reinterpret_cast(info.userData) << std::endl; std::cout << "Accessible by: "; for (uint32_t i = 0; i < count; i++) { @@ -503,7 +453,6 @@ hsa_status_t DumpPointerInfo(void* ptr) { return HSA_STATUS_SUCCESS; } - /*! \brief Writes to the buffer and increments the write pointer to the * buffer. Also, ensures that the argument is written to an * aligned memory as specified. Return the new write pointer. diff --git a/projects/rdc/rdci/include/RdciDiagSubSystem.h b/projects/rdc/rdci/include/RdciDiagSubSystem.h index 9a884c4d94..0ca325a7c1 100644 --- a/projects/rdc/rdci/include/RdciDiagSubSystem.h +++ b/projects/rdc/rdci/include/RdciDiagSubSystem.h @@ -22,40 +22,39 @@ THE SOFTWARE. #ifndef RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_ #define RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_ #include + #include -#include #include +#include + #include "RdciSubSystem.h" namespace amd { namespace rdc { -class RdciDiagSubSystem: public RdciSubSystem { +class RdciDiagSubSystem : public RdciSubSystem { public: - RdciDiagSubSystem(); - ~RdciDiagSubSystem(); - void parse_cmd_opts(int argc, char ** argv) override; - void process() override; + RdciDiagSubSystem(); + ~RdciDiagSubSystem(); + void parse_cmd_opts(int argc, char** argv) override; + void process() override; private: - void show_help() const; + void show_help() const; - std::string get_test_name( - rdc_diag_test_cases_t test_case) const; + std::string get_test_name(rdc_diag_test_cases_t test_case) const; - enum OPERATIONS { - DIAG_UNKNOWN = 0, - DIAG_HELP, - DIAG_RUN, - } diag_ops_; + enum OPERATIONS { + DIAG_UNKNOWN = 0, + DIAG_HELP, + DIAG_RUN, + } diag_ops_; - rdc_gpu_group_t group_id_; - rdc_diag_level_t run_level_; + rdc_gpu_group_t group_id_; + rdc_diag_level_t run_level_; }; - } 
// namespace rdc } // namespace amd - #endif // RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_ diff --git a/projects/rdc/rdci/include/RdciDiscoverySubSystem.h b/projects/rdc/rdci/include/RdciDiscoverySubSystem.h index 15b6089532..b0851e7ac9 100644 --- a/projects/rdc/rdci/include/RdciDiscoverySubSystem.h +++ b/projects/rdc/rdci/include/RdciDiscoverySubSystem.h @@ -27,19 +27,18 @@ THE SOFTWARE. namespace amd { namespace rdc { -class RdciDiscoverySubSystem: public RdciSubSystem { +class RdciDiscoverySubSystem : public RdciSubSystem { public: - RdciDiscoverySubSystem(); - void parse_cmd_opts(int argc, char ** argv) override; - void process() override; - private: - bool show_help_; - void show_help() const; -}; + RdciDiscoverySubSystem(); + void parse_cmd_opts(int argc, char** argv) override; + void process() override; + private: + bool show_help_; + void show_help() const; +}; } // namespace rdc } // namespace amd - #endif // RDCI_INCLUDE_RDCIDISCOVERYSUBSYSTEM_H_ diff --git a/projects/rdc/rdci/include/RdciDmonSubSystem.h b/projects/rdc/rdci/include/RdciDmonSubSystem.h index aa2ad3f3b1..5870c8d9f3 100644 --- a/projects/rdc/rdci/include/RdciDmonSubSystem.h +++ b/projects/rdc/rdci/include/RdciDmonSubSystem.h @@ -22,58 +22,57 @@ THE SOFTWARE. 
#ifndef RDCI_INCLUDE_RDCIDMONSUBSYSTEM_H_ #define RDCI_INCLUDE_RDCIDMONSUBSYSTEM_H_ #include + #include #include -#include "RdciSubSystem.h" +#include "RdciSubSystem.h" namespace amd { namespace rdc { -class RdciDmonSubSystem: public RdciSubSystem { +class RdciDmonSubSystem : public RdciSubSystem { public: - RdciDmonSubSystem(); - ~RdciDmonSubSystem(); - void parse_cmd_opts(int argc, char ** argv) override; - void process() override; + RdciDmonSubSystem(); + ~RdciDmonSubSystem(); + void parse_cmd_opts(int argc, char** argv) override; + void process() override; private: - void show_help() const; - void show_field_usage() const; - void clean_up(); + void show_help() const; + void show_field_usage() const; + void clean_up(); - void create_temp_group(); - void create_temp_field_group(); + void create_temp_group(); + void create_temp_field_group(); - enum OPERATIONS { - DMON_UNKNOWN = 0, - DMON_HELP, - DMON_LIST_FIELDS, - DMON_LIST_ALL_FIELDS, - DMON_MONITOR - } dmon_ops_; + enum OPERATIONS { + DMON_UNKNOWN = 0, + DMON_HELP, + DMON_LIST_FIELDS, + DMON_LIST_ALL_FIELDS, + DMON_MONITOR + } dmon_ops_; - enum OPTIONS { - OPTIONS_UNKNOWN = 0, - OPTIONS_COUNT, - OPTIONS_DELAY, - OPTIONS_FIELD_GROUP_ID, - OPTIONS_GROUP_ID - }; + enum OPTIONS { + OPTIONS_UNKNOWN = 0, + OPTIONS_COUNT, + OPTIONS_DELAY, + OPTIONS_FIELD_GROUP_ID, + OPTIONS_GROUP_ID + }; - std::map options_; - std::vector field_ids_; - std::vector gpu_indexes_; - bool need_cleanup_; - uint64_t latest_time_stamp_; - bool show_timpstamps_; - static volatile sig_atomic_t is_terminating_; - static void set_terminating(int sig); + std::map options_; + std::vector field_ids_; + std::vector gpu_indexes_; + bool need_cleanup_; + uint64_t latest_time_stamp_; + bool show_timpstamps_; + static volatile sig_atomic_t is_terminating_; + static void set_terminating(int sig); }; - } // namespace rdc } // namespace amd - #endif // RDCI_INCLUDE_RDCIDMONSUBSYSTEM_H_ diff --git a/projects/rdc/rdci/include/RdciFieldGroupSubSystem.h 
b/projects/rdc/rdci/include/RdciFieldGroupSubSystem.h index 05c264c2a3..58a355cf2d 100644 --- a/projects/rdc/rdci/include/RdciFieldGroupSubSystem.h +++ b/projects/rdc/rdci/include/RdciFieldGroupSubSystem.h @@ -23,37 +23,37 @@ THE SOFTWARE. #define RDCI_INCLUDE_RDCIFIELDGROUPSUBSYSTEM_H_ #include + #include "RdciSubSystem.h" namespace amd { namespace rdc { -class RdciFieldGroupSubSystem: public RdciSubSystem { +class RdciFieldGroupSubSystem : public RdciSubSystem { public: - RdciFieldGroupSubSystem(); - void parse_cmd_opts(int argc, char ** argv) override; - void process() override; + RdciFieldGroupSubSystem(); + void parse_cmd_opts(int argc, char** argv) override; + void process() override; + private: - void show_help() const; + void show_help() const; - enum OPERATIONS { - FIELD_GROUP_UNKNOWN = 0, - FIELD_GROUP_HELP, - FIELD_GROUP_CREATE, - FIELD_GROUP_DELETE, - FIELD_GROUP_LIST, - FIELD_GROUP_INFO - } field_group_ops_; + enum OPERATIONS { + FIELD_GROUP_UNKNOWN = 0, + FIELD_GROUP_HELP, + FIELD_GROUP_CREATE, + FIELD_GROUP_DELETE, + FIELD_GROUP_LIST, + FIELD_GROUP_INFO + } field_group_ops_; - bool is_group_set_; - uint32_t group_id_; - std::string group_name_; - std::string field_ids_; + bool is_group_set_; + uint32_t group_id_; + std::string group_name_; + std::string field_ids_; }; - } // namespace rdc } // namespace amd - #endif // RDCI_INCLUDE_RDCIFIELDGROUPSUBSYSTEM_H_ diff --git a/projects/rdc/rdci/include/RdciGroupSubSystem.h b/projects/rdc/rdci/include/RdciGroupSubSystem.h index 2db13558eb..268cc7860a 100644 --- a/projects/rdc/rdci/include/RdciGroupSubSystem.h +++ b/projects/rdc/rdci/include/RdciGroupSubSystem.h @@ -24,39 +24,38 @@ THE SOFTWARE. 
#include #include + #include "RdciSubSystem.h" namespace amd { namespace rdc { -class RdciGroupSubSystem: public RdciSubSystem { +class RdciGroupSubSystem : public RdciSubSystem { public: - RdciGroupSubSystem(); - void parse_cmd_opts(int argc, char ** argv) override; - void process() override; + RdciGroupSubSystem(); + void parse_cmd_opts(int argc, char** argv) override; + void process() override; private: - void show_help() const; + void show_help() const; - enum OPERATIONS { - GROUP_UNKNOWN = 0, - GROUP_HELP, - GROUP_CREATE, - GROUP_DELETE, - GROUP_LIST, - GROUP_ADD_GPUS, - GROUP_INFO - } group_ops_; + enum OPERATIONS { + GROUP_UNKNOWN = 0, + GROUP_HELP, + GROUP_CREATE, + GROUP_DELETE, + GROUP_LIST, + GROUP_ADD_GPUS, + GROUP_INFO + } group_ops_; - bool is_group_set_; - uint32_t group_id_; - std::string group_name_; - std::string gpu_ids_; + bool is_group_set_; + uint32_t group_id_; + std::string group_name_; + std::string gpu_ids_; }; - } // namespace rdc } // namespace amd - #endif // RDCI_INCLUDE_RDCIGROUPSUBSYSTEM_H_ diff --git a/projects/rdc/rdci/include/RdciStatsSubSystem.h b/projects/rdc/rdci/include/RdciStatsSubSystem.h index 116f3211ef..c7f35ccdbd 100644 --- a/projects/rdc/rdci/include/RdciStatsSubSystem.h +++ b/projects/rdc/rdci/include/RdciStatsSubSystem.h @@ -22,44 +22,42 @@ THE SOFTWARE. 
#ifndef RDCI_INCLUDE_RDCISTATSSUBSYSTEM_H_ #define RDCI_INCLUDE_RDCISTATSSUBSYSTEM_H_ #include -#include -#include "RdciSubSystem.h" +#include + +#include "RdciSubSystem.h" namespace amd { namespace rdc { -class RdciStatsSubSystem: public RdciSubSystem { +class RdciStatsSubSystem : public RdciSubSystem { public: - RdciStatsSubSystem(); - ~RdciStatsSubSystem(); - void parse_cmd_opts(int argc, char ** argv) override; - void process() override; + RdciStatsSubSystem(); + ~RdciStatsSubSystem(); + void parse_cmd_opts(int argc, char** argv) override; + void process() override; private: - void show_help() const; - void show_job_stats(const rdc_gpu_usage_info_t& gpu_info) const; - void show_job_stats_json(const rdc_gpu_usage_info_t& gpu_info) const; + void show_help() const; + void show_job_stats(const rdc_gpu_usage_info_t& gpu_info) const; + void show_job_stats_json(const rdc_gpu_usage_info_t& gpu_info) const; - enum OPERATIONS { - STATS_UNKNOWN = 0, - STATS_HELP, - STATS_START_RECORDING, - STATS_STOP_RECORDING, - STATS_DISPLAY, - STATS_REMOVE, - STATS_REMOVE_ALL - } stats_ops_; + enum OPERATIONS { + STATS_UNKNOWN = 0, + STATS_HELP, + STATS_START_RECORDING, + STATS_STOP_RECORDING, + STATS_DISPLAY, + STATS_REMOVE, + STATS_REMOVE_ALL + } stats_ops_; - - std::string job_id_; - uint32_t group_id_; - bool is_verbose_ = false; + std::string job_id_; + uint32_t group_id_; + bool is_verbose_ = false; }; - } // namespace rdc } // namespace amd - #endif // RDCI_INCLUDE_RDCISTATSSUBSYSTEM_H_ diff --git a/projects/rdc/rdci/include/RdciSubSystem.h b/projects/rdc/rdci/include/RdciSubSystem.h index df8d65ff20..0c296cd793 100644 --- a/projects/rdc/rdci/include/RdciSubSystem.h +++ b/projects/rdc/rdci/include/RdciSubSystem.h @@ -25,38 +25,38 @@ THE SOFTWARE. 
#include #include #include -#include "rdc_lib/rdc_common.h" + #include "rdc/rdc.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { class RdciSubSystem { public: - RdciSubSystem(); - virtual void parse_cmd_opts(int argc, char ** argv) = 0; - virtual void connect(); + RdciSubSystem(); + virtual void parse_cmd_opts(int argc, char** argv) = 0; + virtual void connect(); - virtual void process() = 0; - virtual ~RdciSubSystem(); + virtual void process() = 0; + virtual ~RdciSubSystem(); - bool is_json_output() const; + bool is_json_output() const; protected: - void set_json_output(bool is_json); - std::vector split_string(const std::string& s, - char delimiter) const; - void show_common_usage() const; - rdc_handle_t rdc_handle_; - std::string ip_port_; + void set_json_output(bool is_json); + std::vector split_string(const std::string& s, char delimiter) const; + void show_common_usage() const; + rdc_handle_t rdc_handle_; + std::string ip_port_; - bool use_auth_; - std::string root_ca_; - std::string client_cert_; - std::string client_key_; + bool use_auth_; + std::string root_ca_; + std::string client_cert_; + std::string client_key_; private: - bool is_json_output_; + bool is_json_output_; }; typedef std::shared_ptr RdciSubSystemPtr; @@ -64,5 +64,4 @@ typedef std::shared_ptr RdciSubSystemPtr; } // namespace rdc } // namespace amd - #endif // RDCI_INCLUDE_RDCISUBSYSTEM_H_ diff --git a/projects/rdc/rdci/src/RdciDiagSubSystem.cc b/projects/rdc/rdci/src/RdciDiagSubSystem.cc index 3dfecc5758..d3bbc38635 100644 --- a/projects/rdc/rdci/src/RdciDiagSubSystem.cc +++ b/projects/rdc/rdci/src/RdciDiagSubSystem.cc @@ -20,187 +20,164 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "RdciDiagSubSystem.h" + +#include #include -#include #include #include -#include +#include -#include +#include #include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include "rdc_lib/rdc_common.h" -#include "common/rdc_utils.h" #include "common/rdc_fields_supported.h" +#include "common/rdc_utils.h" #include "rdc/rdc.h" #include "rdc_lib/RdcException.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { +RdciDiagSubSystem::RdciDiagSubSystem() : diag_ops_(DIAG_RUN), run_level_(RDC_DIAG_LVL_SHORT) {} -RdciDiagSubSystem::RdciDiagSubSystem(): diag_ops_(DIAG_RUN) - , run_level_(RDC_DIAG_LVL_SHORT) { -} +RdciDiagSubSystem::~RdciDiagSubSystem() {} -RdciDiagSubSystem::~RdciDiagSubSystem() { -} +void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) { + const int HOST_OPTIONS = 1000; + const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS}, + {"help", optional_argument, nullptr, 'h'}, + {"unauth", optional_argument, nullptr, 'u'}, + {"run-level", required_argument, nullptr, 'r'}, + {"group-id", required_argument, nullptr, 'g'}, + {nullptr, 0, nullptr, 0}}; -void RdciDiagSubSystem::parse_cmd_opts(int argc, char ** argv) { - const int HOST_OPTIONS = 1000; - const struct option long_options[] = { - {"host", required_argument, nullptr, HOST_OPTIONS}, - {"help", optional_argument, nullptr, 'h'}, - {"unauth", optional_argument, nullptr, 'u'}, - {"run-level", required_argument, nullptr, 'r'}, - {"group-id", required_argument, nullptr, 'g'}, - { nullptr, 0 , nullptr, 0 } - }; + bool group_id_set = false; + int option_index = 0; + int opt = 0; - bool group_id_set = false; - int option_index = 0; - int opt = 0; - - while ((opt = getopt_long(argc, argv, "hug:r:", - long_options, &option_index)) != -1) { - switch (opt) { - case HOST_OPTIONS: - ip_port_ = optarg; - break; - case 'h': - diag_ops_ = DIAG_HELP; - return; - case 'u': - use_auth_ = false; - break; - case 
'g': - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The group id needs to be a number"); - } - group_id_ = std::stoi(optarg); - group_id_set = true; - break; - case 'r': - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The run level needs to be a number"); - } - run_level_ = static_cast(std::stoi(optarg)); - break; - default: - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Unknown command line options"); + while ((opt = getopt_long(argc, argv, "hug:r:", long_options, &option_index)) != -1) { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case 'h': + diag_ops_ = DIAG_HELP; + return; + case 'u': + use_auth_ = false; + break; + case 'g': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number"); } - } - - if (!group_id_set) { + group_id_ = std::stoi(optarg); + group_id_set = true; + break; + case 'r': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The run level needs to be a number"); + } + run_level_ = static_cast(std::stoi(optarg)); + break; + default: show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify the GPU group id"); + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options"); } + } + + if (!group_id_set) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPU group id"); + } } void RdciDiagSubSystem::show_help() const { - // Try to keep total output line length to <= 80 chars for better - // readability. For reference: - // *********************** 60 Chars ************************** - // ************** 40 Chars *************** - // ***** 20 Chars **** - std::cout << " diag -- Used to run diagnostic for GPUs.\n\n"; - std::cout << "Usage\n"; - std::cout << " rdci diag [--host :port] [-u] -g " + // Try to keep total output line length to <= 80 chars for better + // readability. 
For reference: + // *********************** 60 Chars ************************** + // ************** 40 Chars *************** + // ***** 20 Chars **** + std::cout << " diag -- Used to run diagnostic for GPUs.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci diag [--host :port] [-u] -g " << " -r \n"; - std::cout << "\nFlags:\n"; - show_common_usage(); - std::cout << " -g --group-id The GPU group to diagnose" - << " on the specified host.\n"; - std::cout << " -r --run-level level Integer representing test" - << " run levels [default = 1].\n" - << " level 1: Tests take a " - << "few seconds to run.\n" - << " level 2: Tests take a " - << "few minutes to run (To be implemented).\n" - << " level 3: Tests take " - << "half an hour to run (To be implemented).\n"; + std::cout << "\nFlags:\n"; + show_common_usage(); + std::cout << " -g --group-id The GPU group to diagnose" + << " on the specified host.\n"; + std::cout << " -r --run-level level Integer representing test" + << " run levels [default = 1].\n" + << " level 1: Tests take a " + << "few seconds to run.\n" + << " level 2: Tests take a " + << "few minutes to run (To be implemented).\n" + << " level 3: Tests take " + << "half an hour to run (To be implemented).\n"; } -std::string RdciDiagSubSystem::get_test_name - (rdc_diag_test_cases_t test_case) const { - const std::map test_desc = { - {RDC_DIAG_COMPUTE_PROCESS, "No compute process"}, - {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"}, - {RDC_DIAG_SYS_MEM_CHECK, "System memory check"}, - {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"}, - {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}, - {RDC_DIAG_TEST_LAST, "Unknown"} - }; +std::string RdciDiagSubSystem::get_test_name(rdc_diag_test_cases_t test_case) const { + const std::map test_desc = { + {RDC_DIAG_COMPUTE_PROCESS, "No compute process"}, + {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"}, + {RDC_DIAG_SYS_MEM_CHECK, "System memory check"}, + {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"}, + 
{RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}, + {RDC_DIAG_TEST_LAST, "Unknown"}}; - auto test_name = test_desc.find(test_case); - if (test_name == test_desc.end()) { - return "Unknown Test"; - } - return test_name->second; + auto test_name = test_desc.find(test_case); + if (test_name == test_desc.end()) { + return "Unknown Test"; + } + return test_name->second; } void RdciDiagSubSystem::process() { - if (diag_ops_ == DIAG_HELP || - diag_ops_ == DIAG_UNKNOWN) { - show_help(); - return; - } + if (diag_ops_ == DIAG_HELP || diag_ops_ == DIAG_UNKNOWN) { + show_help(); + return; + } - rdc_status_t result; - rdc_diag_response_t response; - result = rdc_diagnostic_run(rdc_handle_, group_id_, - run_level_, &response); + rdc_status_t result; + rdc_diag_response_t response; + result = rdc_diagnostic_run(rdc_handle_, group_id_, run_level_, &response); - if (result != RDC_ST_OK) { - std::string error_msg = rdc_status_string(result); - throw RdcException(result, error_msg.c_str()); - } + if (result != RDC_ST_OK) { + std::string error_msg = rdc_status_string(result); + throw RdcException(result, error_msg.c_str()); + } - // (3) Check diagnostic results - for (uint32_t i=0 ; i < response.results_count; i++) { - const rdc_diag_test_result_t& test_result = - response.diag_info[i]; - std::cout << std::setw(26) << std::left - << get_test_name(test_result.test_case) + ":" - << rdc_diagnostic_result_string(test_result.status) << "\n"; - } + // (3) Check diagnostic results + for (uint32_t i = 0; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = response.diag_info[i]; + std::cout << std::setw(26) << std::left << get_test_name(test_result.test_case) + ":" + << rdc_diagnostic_result_string(test_result.status) << "\n"; + } - // (4) diagnostic detail information - std::cout <<" =============== Diagnostic Details ==================\n"; - for (uint32_t i=0 ; i < response.results_count; i++) { - const rdc_diag_test_result_t& test_result = - 
response.diag_info[i]; - if (test_result.info[0] != '\0') { - std::cout << std::setw(26) << std::left - << get_test_name(test_result.test_case) + ":" + // (4) diagnostic detail information + std::cout << " =============== Diagnostic Details ==================\n"; + for (uint32_t i = 0; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = response.diag_info[i]; + if (test_result.info[0] != '\0') { + std::cout << std::setw(26) << std::left << get_test_name(test_result.test_case) + ":" << test_result.info << "\n"; - } - for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) { - const rdc_diag_per_gpu_result_t& gpu_result - = test_result.gpu_results[j]; - if (strlen(gpu_result.gpu_result.msg) > 0) { - std::cout << " GPU " << gpu_result.gpu_index << " " << - gpu_result.gpu_result.msg << "\n"; - } - } } + for (uint32_t j = 0; j < test_result.per_gpu_result_count; j++) { + const rdc_diag_per_gpu_result_t& gpu_result = test_result.gpu_results[j]; + if (strlen(gpu_result.gpu_result.msg) > 0) { + std::cout << " GPU " << gpu_result.gpu_index << " " << gpu_result.gpu_result.msg << "\n"; + } + } + } } - } // namespace rdc } // namespace amd - - diff --git a/projects/rdc/rdci/src/RdciDiscoverySubSystem.cc b/projects/rdc/rdci/src/RdciDiscoverySubSystem.cc index 02a411e8d1..0a50c78098 100644 --- a/projects/rdc/rdci/src/RdciDiscoverySubSystem.cc +++ b/projects/rdc/rdci/src/RdciDiscoverySubSystem.cc @@ -19,140 +19,127 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include -#include -#include "rdc_lib/rdc_common.h" -#include "rdc/rdc.h" -#include "rdc_lib/RdcException.h" #include "RdciDiscoverySubSystem.h" +#include +#include + +#include "rdc/rdc.h" +#include "rdc_lib/RdcException.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdciDiscoverySubSystem::RdciDiscoverySubSystem() : show_help_(false) { -} +RdciDiscoverySubSystem::RdciDiscoverySubSystem() : show_help_(false) {} -void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char ** argv) { - const int HOST_OPTIONS = 1000; - const int JSON_OPTIONS = 1001; - const struct option long_options[] = { - {"host", required_argument, nullptr, HOST_OPTIONS }, - {"help", optional_argument, nullptr, 'h' }, - {"unauth", optional_argument, nullptr, 'u' }, - {"list", optional_argument, nullptr, 'l' }, - {"json", optional_argument, nullptr, JSON_OPTIONS }, - { nullptr, 0 , nullptr, 0 } - }; +void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) { + const int HOST_OPTIONS = 1000; + const int JSON_OPTIONS = 1001; + const struct option long_options[] = { + {"host", required_argument, nullptr, HOST_OPTIONS}, {"help", optional_argument, nullptr, 'h'}, + {"unauth", optional_argument, nullptr, 'u'}, {"list", optional_argument, nullptr, 'l'}, + {"json", optional_argument, nullptr, JSON_OPTIONS}, {nullptr, 0, nullptr, 0}}; - int option_index = 0; - int opt = 0; - bool is_list = false; + int option_index = 0; + int opt = 0; + bool is_list = false; - while ((opt = getopt_long(argc, argv, "hlu", - long_options, &option_index)) != -1) { - switch (opt) { - case HOST_OPTIONS: - ip_port_ = optarg; - break; - case JSON_OPTIONS: - set_json_output(true); - break; - case 'h': - show_help_ = true; - return; - case 'u': - use_auth_ = false; - break; - case 'l': - is_list = true; - break; - default: - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Unknown command line options"); - } - } - - if (!is_list) { + while ((opt = getopt_long(argc, argv, "hlu", 
long_options, &option_index)) != -1) { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case JSON_OPTIONS: + set_json_output(true); + break; + case 'h': + show_help_ = true; + return; + case 'u': + use_auth_ = false; + break; + case 'l': + is_list = true; + break; + default: show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify operations"); + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options"); } + } + + if (!is_list) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify operations"); + } } void RdciDiscoverySubSystem::show_help() const { - if (is_json_output()) return; - std::cout << " discovery -- Used to discover and identify GPUs " - << "and their attributes.\n\n"; - std::cout << "Usage\n"; - std::cout << " rdci discovery [--host :port] [--json]" + if (is_json_output()) return; + std::cout << " discovery -- Used to discover and identify GPUs " + << "and their attributes.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci discovery [--host :port] [--json]" << " [-u] -l\n"; - std::cout << "\nFlags:\n"; - show_common_usage(); - std::cout << " --json " - << "Output using json.\n"; - std::cout << " -l --list list GPU discovered" - <<" on the system\n"; + std::cout << "\nFlags:\n"; + show_common_usage(); + std::cout << " --json " + << "Output using json.\n"; + std::cout << " -l --list list GPU discovered" + << " on the system\n"; } - void RdciDiscoverySubSystem::process() { - if (show_help_) { - return show_help(); - } + if (show_help_) { + return show_help(); + } - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; - uint32_t count = 0; - rdc_status_t result = rdc_device_get_all(rdc_handle_, - gpu_index_list, &count); - if (result != RDC_ST_OK) { - throw RdcException(result, "Fail to get device information"); - } - if (count == 0) { - if (is_json_output()) { - std::cout << "\"gpus\" : [], \"status\": \"ok\""; - } else { - std::cout << "No GPUs find on the system\n"; - } - return; + 
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + uint32_t count = 0; + rdc_status_t result = rdc_device_get_all(rdc_handle_, gpu_index_list, &count); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to get device information"); + } + if (count == 0) { + if (is_json_output()) { + std::cout << "\"gpus\" : [], \"status\": \"ok\""; + } else { + std::cout << "No GPUs find on the system\n"; } + return; + } - if (is_json_output()) { - std::cout << "\"gpus\" : ["; - } else { - std::cout << count << " GPUs found.\n"; - std::cout << "------------------------------------------------" - << "-----------------\n"; - std::cout << "GPU Index\t Device Information\n"; - } - for (uint32_t i = 0; i < count; i++) { - rdc_device_attributes_t attribute; - result = rdc_device_get_attributes(rdc_handle_, - gpu_index_list[i], &attribute); - if (result != RDC_ST_OK) { - return; - } - if (is_json_output()) { - std::cout << "{\"gpu_index\": \"" << i << "\", \"device_name\": \"" - << attribute.device_name << "\"}"; - if (i != count -1) { - std::cout << ","; - } - } else { - std::cout << i << "\t\t" << attribute.device_name < -#include -#include + #include +#include +#include +#include #include -#include +#include #include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include "rdc_lib/rdc_common.h" -#include "common/rdc_utils.h" #include "common/rdc_fields_supported.h" +#include "common/rdc_utils.h" #include "rdc/rdc.h" #include "rdc_lib/RdcException.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { @@ -48,301 +49,280 @@ namespace rdc { // to notify the program to clean up the resources created by the subsystem. 
volatile sig_atomic_t RdciDmonSubSystem::is_terminating_ = 0; -RdciDmonSubSystem::RdciDmonSubSystem(): - dmon_ops_(DMON_MONITOR) - , need_cleanup_(false) - , show_timpstamps_(false) { - signal(SIGINT, set_terminating); +RdciDmonSubSystem::RdciDmonSubSystem() + : dmon_ops_(DMON_MONITOR), need_cleanup_(false), show_timpstamps_(false) { + signal(SIGINT, set_terminating); } -RdciDmonSubSystem::~RdciDmonSubSystem() { - clean_up(); -} +RdciDmonSubSystem::~RdciDmonSubSystem() { clean_up(); } void RdciDmonSubSystem::set_terminating(int sig) { if (sig == SIGINT) { - is_terminating_ = 1; + is_terminating_ = 1; } } -void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { - const int HOST_OPTIONS = 1000; - const int LIST_ALL_FIELDS_OPT = 1001; - const struct option long_options[] = { - {"host", required_argument, nullptr, HOST_OPTIONS}, - {"help", optional_argument, nullptr, 'h'}, - {"unauth", optional_argument, nullptr, 'u'}, - {"list", optional_argument, nullptr, 'l'}, - {"time-stamp", optional_argument, nullptr, 't'}, - {"list-all", optional_argument, nullptr, LIST_ALL_FIELDS_OPT}, - {"field-group-id", required_argument, nullptr, 'f'}, - {"field-id", required_argument, nullptr, 'e' }, - {"gpu_index", required_argument, nullptr, 'i'}, - {"group-id", required_argument, nullptr, 'g'}, - {"count", required_argument, nullptr, 'c'}, - {"delay", required_argument, nullptr, 'd'}, - { nullptr, 0 , nullptr, 0 } - }; +void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) { + const int HOST_OPTIONS = 1000; + const int LIST_ALL_FIELDS_OPT = 1001; + const struct option long_options[] = { + {"host", required_argument, nullptr, HOST_OPTIONS}, + {"help", optional_argument, nullptr, 'h'}, + {"unauth", optional_argument, nullptr, 'u'}, + {"list", optional_argument, nullptr, 'l'}, + {"time-stamp", optional_argument, nullptr, 't'}, + {"list-all", optional_argument, nullptr, LIST_ALL_FIELDS_OPT}, + {"field-group-id", required_argument, nullptr, 'f'}, + {"field-id", 
required_argument, nullptr, 'e'}, + {"gpu_index", required_argument, nullptr, 'i'}, + {"group-id", required_argument, nullptr, 'g'}, + {"count", required_argument, nullptr, 'c'}, + {"delay", required_argument, nullptr, 'd'}, + {nullptr, 0, nullptr, 0}}; - int option_index = 0; - int opt = 0; - std::string gpu_indexes; - std::string field_ids; + int option_index = 0; + int opt = 0; + std::string gpu_indexes; + std::string field_ids; - while ((opt = getopt_long(argc, argv, "hltuf:g:c:d:e:i:", - long_options, &option_index)) != -1) { - switch (opt) { - case HOST_OPTIONS: - ip_port_ = optarg; - break; - case 'h': - dmon_ops_ = DMON_HELP; - return; - case 'u': - use_auth_ = false; - break; - case 't': - show_timpstamps_ = true; - break; - case 'l': - dmon_ops_ = DMON_LIST_FIELDS; - break; - case 'f': - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The field group id needs to be a number"); - } - options_.insert({OPTIONS_FIELD_GROUP_ID, std::stoi(optarg)}); - break; - case 'e': - field_ids = optarg; - break; - case 'i': - gpu_indexes = optarg; - break; - case 'g': - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The group id needs to be a number"); - } - options_.insert({OPTIONS_GROUP_ID, std::stoi(optarg)}); - break; - case 'c': - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The count needs to be a number"); - } - options_.insert({OPTIONS_COUNT, std::stoi(optarg)}); - break; - case 'd': - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The delay needs to be a number"); - } - options_.insert({OPTIONS_DELAY, std::stoi(optarg)}); - break; - case LIST_ALL_FIELDS_OPT: - dmon_ops_ = DMON_LIST_ALL_FIELDS; - break; - default: - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Unknown command line options"); - } - } - - if (dmon_ops_ == DMON_LIST_FIELDS || dmon_ops_ == DMON_LIST_ALL_FIELDS) { + while ((opt = 
getopt_long(argc, argv, "hltuf:g:c:d:e:i:", long_options, &option_index)) != -1) { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case 'h': + dmon_ops_ = DMON_HELP; return; - } - - if (options_.find(OPTIONS_FIELD_GROUP_ID) == options_.end()) { - if (field_ids == "") { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify the fields or field group id"); - } else { - std::vector vec_ids = split_string(field_ids, ','); - for (uint32_t i = 0; i < vec_ids.size(); i++) { - if (!IsNumber(vec_ids[i])) { - rdc_field_t field_id = RDC_FI_INVALID; - if (!amd::rdc::get_field_id_from_name(vec_ids[i], - &field_id)) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "The field name "+vec_ids[i]+" is not valid"); - } - field_ids_.push_back(field_id); - } else { - field_ids_.push_back(static_cast( - std::stoi(vec_ids[i]))); - } - } + case 'u': + use_auth_ = false; + break; + case 't': + show_timpstamps_ = true; + break; + case 'l': + dmon_ops_ = DMON_LIST_FIELDS; + break; + case 'f': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The field group id needs to be a number"); } - } - - if (options_.find(OPTIONS_GROUP_ID) == options_.end()) { - if (gpu_indexes == "") { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify the GPUs or group id"); - } else { - std::vector vec_ids = split_string(gpu_indexes, ','); - for (uint32_t i = 0; i < vec_ids.size(); i++) { - if (!IsNumber(vec_ids[i])) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "The GPU index "+vec_ids[i]+" needs to be a number"); - } - gpu_indexes_.push_back(std::stoi(vec_ids[i])); - } + options_.insert({OPTIONS_FIELD_GROUP_ID, std::stoi(optarg)}); + break; + case 'e': + field_ids = optarg; + break; + case 'i': + gpu_indexes = optarg; + break; + case 'g': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number"); } + options_.insert({OPTIONS_GROUP_ID, 
std::stoi(optarg)}); + break; + case 'c': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The count needs to be a number"); + } + options_.insert({OPTIONS_COUNT, std::stoi(optarg)}); + break; + case 'd': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The delay needs to be a number"); + } + options_.insert({OPTIONS_DELAY, std::stoi(optarg)}); + break; + case LIST_ALL_FIELDS_OPT: + dmon_ops_ = DMON_LIST_ALL_FIELDS; + break; + default: + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options"); } + } - // Group and GPU index cannot co-exist - if (gpu_indexes != "" && - options_.find(OPTIONS_GROUP_ID) != options_.end()) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Use either the group or GPU indexes"); - } + if (dmon_ops_ == DMON_LIST_FIELDS || dmon_ops_ == DMON_LIST_ALL_FIELDS) { + return; + } - // Field group and field Ids cannot co-exist - if (field_ids != "" && - options_.find(OPTIONS_FIELD_GROUP_ID) != options_.end()) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Use either the field group or field IDs"); + if (options_.find(OPTIONS_FIELD_GROUP_ID) == options_.end()) { + if (field_ids == "") { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the fields or field group id"); + } else { + std::vector vec_ids = split_string(field_ids, ','); + for (uint32_t i = 0; i < vec_ids.size(); i++) { + if (!IsNumber(vec_ids[i])) { + rdc_field_t field_id = RDC_FI_INVALID; + if (!amd::rdc::get_field_id_from_name(vec_ids[i], &field_id)) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The field name " + vec_ids[i] + " is not valid"); + } + field_ids_.push_back(field_id); + } else { + field_ids_.push_back(static_cast(std::stoi(vec_ids[i]))); + } + } } + } - // Set default delay to 1 second - if (options_.find(OPTIONS_DELAY) == options_.end()) { - options_.insert({OPTIONS_DELAY, 1000}); + if 
(options_.find(OPTIONS_GROUP_ID) == options_.end()) { + if (gpu_indexes == "") { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPUs or group id"); + } else { + std::vector vec_ids = split_string(gpu_indexes, ','); + for (uint32_t i = 0; i < vec_ids.size(); i++) { + if (!IsNumber(vec_ids[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The GPU index " + vec_ids[i] + " needs to be a number"); + } + gpu_indexes_.push_back(std::stoi(vec_ids[i])); + } } + } - // Set default count to max integer - if (options_.find(OPTIONS_COUNT) == options_.end()) { - options_.insert({OPTIONS_COUNT, std::numeric_limits::max()}); - } + // Group and GPU index cannot co-exist + if (gpu_indexes != "" && options_.find(OPTIONS_GROUP_ID) != options_.end()) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Use either the group or GPU indexes"); + } + + // Field group and field Ids cannot co-exist + if (field_ids != "" && options_.find(OPTIONS_FIELD_GROUP_ID) != options_.end()) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Use either the field group or field IDs"); + } + + // Set default delay to 1 second + if (options_.find(OPTIONS_DELAY) == options_.end()) { + options_.insert({OPTIONS_DELAY, 1000}); + } + + // Set default count to max integer + if (options_.find(OPTIONS_COUNT) == options_.end()) { + options_.insert({OPTIONS_COUNT, std::numeric_limits::max()}); + } } void RdciDmonSubSystem::show_help() const { - // Try to keep total output line length to <= 80 chars for better - // readability. For reference: - // *********************** 60 Chars ************************** - // ************** 40 Chars *************** - // ***** 20 Chars **** - std::cout << " dmon -- Used to monitor GPUs and their stats.\n\n"; - std::cout << "Usage\n"; - std::cout << " rdci dmon [--host :port] [-u] -f " + // Try to keep total output line length to <= 80 chars for better + // readability. 
For reference: + // *********************** 60 Chars ************************** + // ************** 40 Chars *************** + // ***** 20 Chars **** + std::cout << " dmon -- Used to monitor GPUs and their stats.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci dmon [--host :port] [-u] -f " << " -g \n"; - std::cout << " [-d ] [-c ]\n"; - std::cout << " rdci dmon [--host :port] [-u] -e " - << " -i \n"; - std::cout << " [-d ] [-c ]\n"; - std::cout << " rdci dmon [--host :port] [-u] -l \n"; - std::cout << "\nFlags:\n"; - show_common_usage(); - std::cout << " -f --field-group-id The field group " - << "to query on the specified host.\n"; - std::cout << " -g --group-id The GPU group to query " - << "on the specified host.\n"; - std::cout << " -c --count count Integer representing How" - << " many times to loop\n" - << " before exiting. [default " - << "= runs forever.]\n"; - std::cout << " -e --field-id fieldIds Comma-separated list " - << "of field ids to monitor.\n"; - std::cout << " -i --gpu_index gpuIndexes Comma-separated list " - << "of GPU indexes to monitor.\n"; - std::cout << " -d --delay delay How often to query RDC " - << "in milli seconds. 
\n" - << " [default = 1000 msec, " - << "Minimum value = 100 msec.]\n"; - std::cout << " -l --list List to look up the long " - << "names and \n" - << " descriptions of the field " - << "ids\n"; - std::cout << " -t --time-stamp Include timestamps in " - << "display\n"; - std::cout << " --list-all Same as -l, except this " - << "lists all possible\n" - << " fields, including " - << "those that are less \n" - << " commonly used.\n"; + std::cout << " [-d ] [-c ]\n"; + std::cout << " rdci dmon [--host :port] [-u] -e " + << " -i \n"; + std::cout << " [-d ] [-c ]\n"; + std::cout << " rdci dmon [--host :port] [-u] -l \n"; + std::cout << "\nFlags:\n"; + show_common_usage(); + std::cout << " -f --field-group-id The field group " + << "to query on the specified host.\n"; + std::cout << " -g --group-id The GPU group to query " + << "on the specified host.\n"; + std::cout << " -c --count count Integer representing How" + << " many times to loop\n" + << " before exiting. [default " + << "= runs forever.]\n"; + std::cout << " -e --field-id fieldIds Comma-separated list " + << "of field ids to monitor.\n"; + std::cout << " -i --gpu_index gpuIndexes Comma-separated list " + << "of GPU indexes to monitor.\n"; + std::cout << " -d --delay delay How often to query RDC " + << "in milli seconds. 
\n" + << " [default = 1000 msec, " + << "Minimum value = 100 msec.]\n"; + std::cout << " -l --list List to look up the long " + << "names and \n" + << " descriptions of the field " + << "ids\n"; + std::cout << " -t --time-stamp Include timestamps in " + << "display\n"; + std::cout << " --list-all Same as -l, except this " + << "lists all possible\n" + << " fields, including " + << "those that are less \n" + << " commonly used.\n"; } void RdciDmonSubSystem::create_temp_group() { - if (gpu_indexes_.size() == 0) { - return; - } + if (gpu_indexes_.size() == 0) { + return; + } - const std::string group_name("rdci-dmon-group"); - rdc_gpu_group_t group_id; - rdc_status_t result = rdc_group_gpu_create(rdc_handle_, - RDC_GROUP_EMPTY, group_name.c_str(), &group_id); + const std::string group_name("rdci-dmon-group"); + rdc_gpu_group_t group_id; + rdc_status_t result = + rdc_group_gpu_create(rdc_handle_, RDC_GROUP_EMPTY, group_name.c_str(), &group_id); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to create the dmon group"); + } + need_cleanup_ = true; + + for (uint32_t i = 0; i < gpu_indexes_.size(); i++) { + result = rdc_group_gpu_add(rdc_handle_, group_id, gpu_indexes_[i]); if (result != RDC_ST_OK) { - throw RdcException(result, "Fail to create the dmon group"); + throw RdcException(result, + "Fail to add " + std::to_string(gpu_indexes_[i]) + " to the dmon group."); } - need_cleanup_ = true; - - for (uint32_t i = 0; i < gpu_indexes_.size() ; i++) { - result = rdc_group_gpu_add(rdc_handle_, group_id, gpu_indexes_[i]); - if (result != RDC_ST_OK) { - throw RdcException(result, "Fail to add " + - std::to_string(gpu_indexes_[i])+" to the dmon group."); - } - } - options_.insert({OPTIONS_GROUP_ID, group_id}); + } + options_.insert({OPTIONS_GROUP_ID, group_id}); } - void RdciDmonSubSystem::create_temp_field_group() { - if (field_ids_.size() == 0) { - return; - } + if (field_ids_.size() == 0) { + return; + } - const std::string 
field_group_name("rdci-dmon-field-group"); - rdc_field_grp_t group_id; - rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; - for (uint32_t i = 0; i < field_ids_.size(); i++) { - field_ids[i] = field_ids_[i]; - } + const std::string field_group_name("rdci-dmon-field-group"); + rdc_field_grp_t group_id; + rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; + for (uint32_t i = 0; i < field_ids_.size(); i++) { + field_ids[i] = field_ids_[i]; + } - rdc_status_t result = rdc_group_field_create(rdc_handle_, - field_ids_.size(), &field_ids[0], field_group_name.c_str(), &group_id); - if (result != RDC_ST_OK) { - throw RdcException(result, "Fail to create the dmon field group."); - } + rdc_status_t result = rdc_group_field_create(rdc_handle_, field_ids_.size(), &field_ids[0], + field_group_name.c_str(), &group_id); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to create the dmon field group."); + } - need_cleanup_ = true; - options_.insert({OPTIONS_FIELD_GROUP_ID, group_id}); + need_cleanup_ = true; + options_.insert({OPTIONS_FIELD_GROUP_ID, group_id}); } - void RdciDmonSubSystem::show_field_usage() const { std::cout << "Supported fields Ids:" << std::endl; - amd::rdc::fld_id2name_map_t &field_id_to_descript = - amd::rdc::get_field_id_description_from_id(); - for (auto i = field_id_to_descript.begin(); - i != field_id_to_descript.end(); i++) { + amd::rdc::fld_id2name_map_t& field_id_to_descript = amd::rdc::get_field_id_description_from_id(); + for (auto i = field_id_to_descript.begin(); i != field_id_to_descript.end(); i++) { if (i->second.do_display || dmon_ops_ == DMON_LIST_ALL_FIELDS) { - std::cout << i->first << " " << i->second.enum_name << " : " << - i->second.description << "." << std::endl; + std::cout << i->first << " " << i->second.enum_name << " : " << i->second.description << "." 
+ << std::endl; } } std::cout << std::endl; std::cout << "* Note: The field ID number associated with a field ID can " - "change" << std::endl; + "change" + << std::endl; std::cout << " from release to release. Field name strings should be " - "used in scripts." << std::endl; + "used in scripts." + << std::endl; } -static void separate_notf_events(const rdc_field_group_info_t *f_info, - std::vector *notif, - std::vector *reg_ev) { +static void separate_notf_events(const rdc_field_group_info_t* f_info, + std::vector* notif, + std::vector* reg_ev) { assert(f_info != nullptr && notif != nullptr && reg_ev != nullptr); for (uint32_t i = 0; i < f_info->count; ++i) { @@ -354,24 +334,22 @@ static void separate_notf_events(const rdc_field_group_info_t *f_info, } } -typedef struct { - uint32_t dev_ind; - rdc_field_value val; +typedef struct { + uint32_t dev_ind; + rdc_field_value val; } notif_dev_value; struct Compare_ts { - bool operator()(const notif_dev_value& r1, const notif_dev_value& r2) { - return r1.val.ts > r2.val.ts; - } + bool operator()(const notif_dev_value& r1, const notif_dev_value& r2) { + return r1.val.ts > r2.val.ts; + } }; -typedef std::priority_queue, Compare_ts> field_pq_t; +typedef std::priority_queue, Compare_ts> field_pq_t; -static void collect_new_notifs(rdc_handle_t h, - const rdc_group_info_t &group_info, - const std::vector ¬if_fields, - std::vector *notif_ts, field_pq_t *notif_pq) { +static void collect_new_notifs(rdc_handle_t h, const rdc_group_info_t& group_info, + const std::vector& notif_fields, + std::vector* notif_ts, field_pq_t* notif_pq) { rdc_status_t ret; notif_dev_value value; std::string error_msg; @@ -380,33 +358,33 @@ static void collect_new_notifs(rdc_handle_t h, assert(notif_ts != nullptr); for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { - for (uint32_t findex = 0; findex < notif_fields.size(); findex++) { - // There may be multiple, repeated events; get all of them - while (true) { - ret = 
rdc_field_get_value_since(h, group_info.entity_ids[gindex], - notif_fields[findex], (*notif_ts)[findex], &next_ts, &value.val); + for (uint32_t findex = 0; findex < notif_fields.size(); findex++) { + // There may be multiple, repeated events; get all of them + while (true) { + ret = rdc_field_get_value_since(h, group_info.entity_ids[gindex], notif_fields[findex], + (*notif_ts)[findex], &next_ts, &value.val); - if (ret == RDC_ST_NOT_FOUND) { - break; - } else if (ret == RDC_ST_OK) { - (*notif_ts)[findex] = next_ts; - value.dev_ind = group_info.entity_ids[gindex]; - if (notif_pq != nullptr) { - notif_pq->push(value); - } - } else { - error_msg = "rdc_field_get_value_since() failed"; - throw RdcException(ret, error_msg.c_str()); + if (ret == RDC_ST_NOT_FOUND) { + break; + } else if (ret == RDC_ST_OK) { + (*notif_ts)[findex] = next_ts; + value.dev_ind = group_info.entity_ids[gindex]; + if (notif_pq != nullptr) { + notif_pq->push(value); } + } else { + error_msg = "rdc_field_get_value_since() failed"; + throw RdcException(ret, error_msg.c_str()); } } + } } } // ts is milliseconds static std::string ts_string(const time_t ts) { - struct tm *timeinfo; - time_t tmp_ts = ts/1000; + struct tm* timeinfo; + time_t tmp_ts = ts / 1000; std::string ret; timeinfo = localtime(&tmp_ts); // NOLINT @@ -416,11 +394,10 @@ static std::string ts_string(const time_t ts) { return ret; } -static void print_and_clr_notif_pq(field_pq_t *notif_pq, bool ts) { +static void print_and_clr_notif_pq(field_pq_t* notif_pq, bool ts) { assert(notif_pq != nullptr); notif_dev_value v; - amd::rdc::fld_id2name_map_t &field_id_to_descript = - amd::rdc::get_field_id_description_from_id(); + amd::rdc::fld_id2name_map_t& field_id_to_descript = amd::rdc::get_field_id_description_from_id(); while (!notif_pq->empty()) { v = notif_pq->top(); notif_pq->pop(); @@ -428,12 +405,10 @@ static void print_and_clr_notif_pq(field_pq_t *notif_pq, bool ts) { std::cout << v.dev_ind << "\t"; if (ts) { - std::cout << std::left << 
std::setw(25) << - ts_string(v.val.ts); + std::cout << std::left << std::setw(25) << ts_string(v.val.ts); } - std::cout << std::left << " **Event: " << - field_id_to_descript.at(v.val.field_id).label; + std::cout << std::left << " **Event: " << field_id_to_descript.at(v.val.field_id).label; std::cout << std::left << "\t\"" << v.val.value.str << "\""; std::cout << std::endl; @@ -441,188 +416,170 @@ static void print_and_clr_notif_pq(field_pq_t *notif_pq, bool ts) { } void RdciDmonSubSystem::process() { - if (dmon_ops_ == DMON_HELP || - dmon_ops_ == DMON_UNKNOWN) { - show_help(); - return; + if (dmon_ops_ == DMON_HELP || dmon_ops_ == DMON_UNKNOWN) { + show_help(); + return; + } + + if (dmon_ops_ == DMON_LIST_FIELDS || dmon_ops_ == DMON_LIST_ALL_FIELDS) { + show_field_usage(); + return; + } + + rdc_status_t result; + rdc_group_info_t group_info; + rdc_field_group_info_t field_info; + + // Create a temporary group/field if pass as GPU indexes or field ids + create_temp_group(); + create_temp_field_group(); + + result = rdc_group_gpu_get_info(rdc_handle_, options_[OPTIONS_GROUP_ID], &group_info); + if (result != RDC_ST_OK) { + std::string error_msg = rdc_status_string(result); + if (result == RDC_ST_NOT_FOUND) { + error_msg = "Cannot find the group " + std::to_string(options_[OPTIONS_GROUP_ID]); + } + throw RdcException(result, error_msg.c_str()); + } + if (group_info.count == 0) { + throw RdcException(RDC_ST_NOT_FOUND, "The gpu group " + + std::to_string(options_[OPTIONS_GROUP_ID]) + + " must contain at least 1 GPU."); + } + result = rdc_group_field_get_info(rdc_handle_, options_[OPTIONS_FIELD_GROUP_ID], &field_info); + if (result != RDC_ST_OK) { + std::string error_msg = rdc_status_string(result); + if (result == RDC_ST_NOT_FOUND) { + error_msg = "Cannot find the field group " + std::to_string(options_[OPTIONS_FIELD_GROUP_ID]); + } + throw RdcException(result, error_msg.c_str()); + } + if (field_info.count == 0) { + throw RdcException(RDC_ST_NOT_FOUND, "The field 
group " + + std::to_string(options_[OPTIONS_FIELD_GROUP_ID]) + + " must contain at least 1 field."); + } + // Divide field_info fields into 2 vectors, 1 for notifications + // and one for non-notifications. Handle these separately below. + std::vector notif_fields; + std::vector reg_fields; + separate_notf_events(&field_info, ¬if_fields, ®_fields); + + // keep extra 1 minute data + double max_keep_age = options_[OPTIONS_DELAY] / 1000.0 + 60; + const int max_keep_samples = 10; // keep only 10 samples + result = + rdc_field_watch(rdc_handle_, options_[OPTIONS_GROUP_ID], options_[OPTIONS_FIELD_GROUP_ID], + options_[OPTIONS_DELAY] * 1000, max_keep_age, max_keep_samples); + need_cleanup_ = true; + + std::stringstream ss; + amd::rdc::fld_id2name_map_t& field_id_to_descript = amd::rdc::get_field_id_description_from_id(); + + if (notif_fields.size() > 0) { + ss << "Listening for events: "; + uint32_t i; + for (i = 0; i < notif_fields.size() - 1; ++i) { + ss << field_id_to_descript.at(notif_fields[i]).label << ", "; + } + ss << field_id_to_descript.at(notif_fields[i]).label << std::endl; + } + ss << "GPU\t"; + if (show_timpstamps_) { + ss << std::left << std::setw(25) << "TIMESTAMP"; + ss << " "; + } + for (uint32_t findex = 0; findex < reg_fields.size(); findex++) { + ss << std::left << std::setw(20) << field_id_string(reg_fields[findex]); + } + ss << std::endl; + + std::string header_line((std::istreambuf_iterator(ss)), (std::istreambuf_iterator())); + + std::vector notif_ts(notif_fields.size()); + field_pq_t notif_pq; + + // Call this once without printing out notfications to initialize + // timestamps. There may be very stale timestamps in cache. 
+ collect_new_notifs(rdc_handle_, group_info, notif_fields, ¬if_ts, nullptr); + + for (uint32_t i = 0; i < options_[OPTIONS_COUNT]; i++) { + if (i % 50 == 0) { + std::cout << header_line; } - if (dmon_ops_ == DMON_LIST_FIELDS || dmon_ops_ == DMON_LIST_ALL_FIELDS) { - show_field_usage(); - return; - } + usleep(options_[OPTIONS_DELAY] * 1000); - rdc_status_t result; - rdc_group_info_t group_info; - rdc_field_group_info_t field_info; + collect_new_notifs(rdc_handle_, group_info, notif_fields, ¬if_ts, ¬if_pq); - // Create a temporary group/field if pass as GPU indexes or field ids - create_temp_group(); - create_temp_field_group(); + print_and_clr_notif_pq(¬if_pq, show_timpstamps_); - result = rdc_group_gpu_get_info(rdc_handle_, - options_[OPTIONS_GROUP_ID], &group_info); - if (result != RDC_ST_OK) { - std::string error_msg = rdc_status_string(result); - if (result == RDC_ST_NOT_FOUND) { - error_msg = "Cannot find the group " + - std::to_string(options_[OPTIONS_GROUP_ID]); + for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { + std::cout << group_info.entity_ids[gindex] << "\t"; + for (uint32_t findex = 0; findex < reg_fields.size(); findex++) { + rdc_field_value value; + + result = rdc_field_get_latest_value(rdc_handle_, group_info.entity_ids[gindex], + reg_fields[findex], &value); + if (result != RDC_ST_OK) { + std::cout << std::left << std::setw(20) << "N/A"; + } else { + if (show_timpstamps_ && findex == 0) { + std::cout << std::left << std::setw(25) << ts_string(value.ts) << " "; + } + + if (value.type == INTEGER) { + std::cout << std::left << std::setw(20) << value.value.l_int; + } else if (value.type == DOUBLE) { + std::cout << std::left << std::setw(20) << std::fixed << std::setprecision(3) + << value.value.dbl; + } else { + std::cout << std::left << std::setw(20) << value.value.str; + } } - throw RdcException(result, error_msg.c_str()); - } - if (group_info.count == 0) { - throw RdcException(RDC_ST_NOT_FOUND, "The gpu group " + - 
std::to_string(options_[OPTIONS_GROUP_ID]) - + " must contain at least 1 GPU."); - } - result = rdc_group_field_get_info(rdc_handle_, - options_[OPTIONS_FIELD_GROUP_ID], &field_info); - if (result != RDC_ST_OK) { - std::string error_msg = rdc_status_string(result); - if (result == RDC_ST_NOT_FOUND) { - error_msg = "Cannot find the field group " + - std::to_string(options_[OPTIONS_FIELD_GROUP_ID]); + + if (is_terminating_) { + clean_up(); + return; } - throw RdcException(result, error_msg.c_str()); - } - if (field_info.count == 0) { - throw RdcException(RDC_ST_NOT_FOUND, "The field group " + - std::to_string(options_[OPTIONS_FIELD_GROUP_ID]) - + " must contain at least 1 field."); - } - // Divide field_info fields into 2 vectors, 1 for notifications - // and one for non-notifications. Handle these separately below. - std::vector notif_fields; - std::vector reg_fields; - separate_notf_events(&field_info, ¬if_fields, ®_fields); - - // keep extra 1 minute data - double max_keep_age = options_[OPTIONS_DELAY]/1000.0 + 60; - const int max_keep_samples = 10; // keep only 10 samples - result = rdc_field_watch(rdc_handle_, - options_[OPTIONS_GROUP_ID], options_[OPTIONS_FIELD_GROUP_ID], - options_[OPTIONS_DELAY]*1000, max_keep_age, max_keep_samples); - need_cleanup_ = true; - - std::stringstream ss; - amd::rdc::fld_id2name_map_t &field_id_to_descript = - amd::rdc::get_field_id_description_from_id(); - - if (notif_fields.size() > 0) { - ss << "Listening for events: "; - uint32_t i; - for (i = 0; i < notif_fields.size() - 1; ++i) { - ss << field_id_to_descript.at(notif_fields[i]).label << ", "; } - ss << field_id_to_descript.at(notif_fields[i]).label << std::endl; + if (reg_fields.size()) { + std::cout << std::endl; + } } - ss << "GPU\t"; - if (show_timpstamps_) { - ss << std::left << std::setw(25) << "TIMESTAMP"; - ss << " "; - } - for (uint32_t findex = 0; findex < reg_fields.size(); findex++) { - ss << std::left << std::setw(20) - << field_id_string(reg_fields[findex]); - } 
- ss << std::endl; + } - std::string header_line((std::istreambuf_iterator(ss)), - (std::istreambuf_iterator())); - - std::vector notif_ts(notif_fields.size()); - field_pq_t notif_pq; - - // Call this once without printing out notfications to initialize - // timestamps. There may be very stale timestamps in cache. - collect_new_notifs(rdc_handle_, group_info, notif_fields, - ¬if_ts, nullptr); - - for (uint32_t i = 0; i < options_[OPTIONS_COUNT]; i++) { - if (i % 50 == 0) { - std::cout << header_line; - } - - usleep(options_[OPTIONS_DELAY]*1000); - - collect_new_notifs(rdc_handle_, group_info, notif_fields, - ¬if_ts, ¬if_pq); - - print_and_clr_notif_pq(¬if_pq, show_timpstamps_); - - for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { - std::cout << group_info.entity_ids[gindex] << "\t"; - for (uint32_t findex = 0; findex < reg_fields.size(); findex++) { - rdc_field_value value; - - result = rdc_field_get_latest_value(rdc_handle_, - group_info.entity_ids[gindex], reg_fields[findex], &value); - if (result != RDC_ST_OK) { - std::cout << std::left << std::setw(20) << "N/A"; - } else { - if (show_timpstamps_ && findex == 0) { - std::cout << std::left << std::setw(25) << - ts_string(value.ts) << " "; - } - - if (value.type == INTEGER) { - std::cout << std::left << std::setw(20) - << value.value.l_int; - } else if (value.type == DOUBLE) { - std::cout << std::left << std::setw(20) - << std::fixed << std::setprecision(3) - << value.value.dbl; - } else { - std::cout << std::left << std::setw(20) - << value.value.str; - } - } - - if (is_terminating_) { - clean_up(); - return; - } - } - if (reg_fields.size()) { - std::cout << std::endl; - } - } - } - - clean_up(); + clean_up(); } - void RdciDmonSubSystem::clean_up() { - if (!need_cleanup_) { - return; - } + if (!need_cleanup_) { + return; + } - // Not throw the errors in order to clean up all resources created - if (options_.find(OPTIONS_GROUP_ID) != options_.end() && - options_.find(OPTIONS_FIELD_GROUP_ID) != 
options_.end()) { - rdc_field_unwatch(rdc_handle_, options_[OPTIONS_GROUP_ID], - options_[OPTIONS_FIELD_GROUP_ID]); - } + // Not throw the errors in order to clean up all resources created + if (options_.find(OPTIONS_GROUP_ID) != options_.end() && + options_.find(OPTIONS_FIELD_GROUP_ID) != options_.end()) { + rdc_field_unwatch(rdc_handle_, options_[OPTIONS_GROUP_ID], options_[OPTIONS_FIELD_GROUP_ID]); + } - if (gpu_indexes_.size() != 0) { - auto group = options_.find(OPTIONS_GROUP_ID); - if (group != options_.end()) { - rdc_group_gpu_destroy(rdc_handle_, group->second); - } + if (gpu_indexes_.size() != 0) { + auto group = options_.find(OPTIONS_GROUP_ID); + if (group != options_.end()) { + rdc_group_gpu_destroy(rdc_handle_, group->second); } + } - if (field_ids_.size() != 0) { - auto fgroup = options_.find(OPTIONS_FIELD_GROUP_ID); - if (fgroup != options_.end()) { - rdc_group_field_destroy(rdc_handle_, fgroup->second); - } + if (field_ids_.size() != 0) { + auto fgroup = options_.find(OPTIONS_FIELD_GROUP_ID); + if (fgroup != options_.end()) { + rdc_group_field_destroy(rdc_handle_, fgroup->second); } + } - need_cleanup_ = false; + need_cleanup_ = false; } } // namespace rdc } // namespace amd - - diff --git a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc index a4a0a1295a..3ce66e10f9 100644 --- a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc +++ b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc @@ -20,303 +20,280 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "RdciFieldGroupSubSystem.h" + #include #include -#include "rdc_lib/rdc_common.h" -#include "common/rdc_utils.h" + #include "common/rdc_fields_supported.h" +#include "common/rdc_utils.h" #include "rdc/rdc.h" #include "rdc_lib/RdcException.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdciFieldGroupSubSystem::RdciFieldGroupSubSystem(): - field_group_ops_(FIELD_GROUP_UNKNOWN) - , is_group_set_(false) { -} +RdciFieldGroupSubSystem::RdciFieldGroupSubSystem() + : field_group_ops_(FIELD_GROUP_UNKNOWN), is_group_set_(false) {} -void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { - const int HOST_OPTIONS = 1000; - const int JSON_OPTIONS = 1001; - const struct option long_options[] = { - {"host", required_argument, nullptr, HOST_OPTIONS }, - {"help", optional_argument, nullptr, 'h' }, - {"unauth", optional_argument, nullptr, 'u' }, - {"list", optional_argument, nullptr, 'l' }, - {"group", required_argument, nullptr, 'g'}, - {"create", required_argument, nullptr, 'c' }, - {"fieldids", required_argument, nullptr, 'f'}, - {"info", optional_argument, nullptr, 'i' }, - {"delete", required_argument, nullptr, 'd' }, - {"json", optional_argument, nullptr, JSON_OPTIONS }, - { nullptr, 0 , nullptr, 0 } - }; +void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char** argv) { + const int HOST_OPTIONS = 1000; + const int JSON_OPTIONS = 1001; + const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS}, + {"help", optional_argument, nullptr, 'h'}, + {"unauth", optional_argument, nullptr, 'u'}, + {"list", optional_argument, nullptr, 'l'}, + {"group", required_argument, nullptr, 'g'}, + {"create", required_argument, nullptr, 'c'}, + {"fieldids", required_argument, nullptr, 'f'}, + {"info", optional_argument, nullptr, 'i'}, + {"delete", required_argument, nullptr, 'd'}, + {"json", optional_argument, nullptr, JSON_OPTIONS}, + {nullptr, 0, nullptr, 0}}; - int option_index = 0; - int opt = 0; + int 
option_index = 0; + int opt = 0; - while ((opt = getopt_long(argc, argv, "hluif:c:g:d:", - long_options, &option_index)) != -1) { - switch (opt) { - case HOST_OPTIONS: - ip_port_ = optarg; - break; - case JSON_OPTIONS: - set_json_output(true); - break; - case 'h': - field_group_ops_ = FIELD_GROUP_HELP; - return; - case 'u': - use_auth_ = false; - break; - case 'l': - field_group_ops_ = FIELD_GROUP_LIST; - break; - case 'f': - field_ids_ = optarg; - break; - case 'g': - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The group id needs to be a number"); - } - group_id_ = std::stoi(optarg); - is_group_set_ = true; - break; - case 'c': - field_group_ops_ = FIELD_GROUP_CREATE; - group_name_ = optarg; - break; - case 'i': - field_group_ops_ = FIELD_GROUP_INFO; - break; - case 'd': - field_group_ops_ = FIELD_GROUP_DELETE; - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The group id needs to be a number"); - } - group_id_ = std::stoi(optarg); - is_group_set_ = true; - break; - default: - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Unknown command line options"); + while ((opt = getopt_long(argc, argv, "hluif:c:g:d:", long_options, &option_index)) != -1) { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case JSON_OPTIONS: + set_json_output(true); + break; + case 'h': + field_group_ops_ = FIELD_GROUP_HELP; + return; + case 'u': + use_auth_ = false; + break; + case 'l': + field_group_ops_ = FIELD_GROUP_LIST; + break; + case 'f': + field_ids_ = optarg; + break; + case 'g': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number"); } - } - - if (field_group_ops_ == FIELD_GROUP_UNKNOWN) { + group_id_ = std::stoi(optarg); + is_group_set_ = true; + break; + case 'c': + field_group_ops_ = FIELD_GROUP_CREATE; + group_name_ = optarg; + break; + case 'i': + field_group_ops_ = FIELD_GROUP_INFO; + break; + 
case 'd': + field_group_ops_ = FIELD_GROUP_DELETE; + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number"); + } + group_id_ = std::stoi(optarg); + is_group_set_ = true; + break; + default: show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Must specify a valid operations"); + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options"); } + } + + if (field_group_ops_ == FIELD_GROUP_UNKNOWN) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Must specify a valid operations"); + } } void RdciFieldGroupSubSystem::show_help() const { - if (is_json_output()) return; - std::cout << " fieldgroup -- Used to create and maintain groups " - << "of field Ids.\n\n"; - std::cout << "Usage\n"; - std::cout << " rdci fieldgroup [--host :port]" - << " [--json] [-u] -l\n"; - std::cout << " rdci fieldgroup [--host :port] [--json]" - << " [-u] -c -f \n"; - std::cout << " rdci fieldgroup [--host :port] [--json] [-u] " - << "-g -i\n"; - std::cout << " rdci fieldgroup [--host :port] [--json] [-u] " + if (is_json_output()) return; + std::cout << " fieldgroup -- Used to create and maintain groups " + << "of field Ids.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci fieldgroup [--host :port]" + << " [--json] [-u] -l\n"; + std::cout << " rdci fieldgroup [--host :port] [--json]" + << " [-u] -c -f \n"; + std::cout << " rdci fieldgroup [--host :port] [--json] [-u] " + << "-g -i\n"; + std::cout << " rdci fieldgroup [--host :port] [--json] [-u] " << "-d \n"; - std::cout << "\nFlags:\n"; - show_common_usage(); - std::cout << " --json " - << "Output using json.\n"; - std::cout << " -l --list " - << "List the field groups that currently exist for a host.\n"; - std::cout << " -g --group groupId " - << "The field group to query on the specified host.\n"; - std::cout << " -c --create groupName " - << "Create a field group on the remote host.\n"; - std::cout << " -f --fieldids fieldIds Comma-separated " 
- << "list of the field ids to add to a field group\n"; - std::cout << " -i --info " - << "Display the information for the specified group Id\n"; - std::cout << " -d --delete groupId " - << "Delete a field group on the remote host.\n"; + std::cout << "\nFlags:\n"; + show_common_usage(); + std::cout << " --json " + << "Output using json.\n"; + std::cout << " -l --list " + << "List the field groups that currently exist for a host.\n"; + std::cout << " -g --group groupId " + << "The field group to query on the specified host.\n"; + std::cout << " -c --create groupName " + << "Create a field group on the remote host.\n"; + std::cout << " -f --fieldids fieldIds Comma-separated " + << "list of the field ids to add to a field group\n"; + std::cout << " -i --info " + << "Display the information for the specified group Id\n"; + std::cout << " -d --delete groupId " + << "Delete a field group on the remote host.\n"; } - void RdciFieldGroupSubSystem::process() { - rdc_status_t result = RDC_ST_OK; - rdc_field_group_info_t group_info; - uint32_t count = 0; - std::string json_group_ids = "\"field_groups\": ["; - switch (field_group_ops_) { - case FIELD_GROUP_HELP: - show_help(); - break; - case FIELD_GROUP_CREATE: - { - if (group_name_ == "") { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Must specify the group name when create a field group"); - } - std::vector fields = split_string(field_ids_, ','); - rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; - for (uint32_t i = 0; i < fields.size(); i++) { - if (!IsNumber(fields[i])) { - if (!get_field_id_from_name(fields[i], &field_ids[i])) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "The field name "+fields[i]+" is not valid"); - } - } else { - field_ids[i] = - static_cast(std::stoi(fields[i])); - } - } - rdc_field_grp_t group_id; - result = rdc_group_field_create(rdc_handle_, fields.size(), - &field_ids[0], group_name_.c_str(), &group_id); - if (result == RDC_ST_OK) { - if (is_json_output()) { - std::cout 
<< "\"field_group_id\": \"" << group_id - <<"\", \"status\": \"ok\""; - } else { - std::cout << "Successfully created a field group" - << " with a group ID " << group_id << std::endl; - return; - } - } - break; - } - case FIELD_GROUP_DELETE: - if (!is_group_set_) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify the group id to delete a group"); - } - result = rdc_group_field_destroy(rdc_handle_, group_id_); - if (result == RDC_ST_OK) { - if (is_json_output()) { - std::cout << "\"field_group_id\": \"" << group_id_ - <<"\", \"status\": \"ok\""; - } else { - std::cout << "Successfully deleted the field group " - << group_id_ << std::endl; - } - return; - } - break; - case FIELD_GROUP_LIST: - rdc_field_grp_t group_id_list[RDC_MAX_NUM_FIELD_GROUPS]; - result = rdc_group_field_get_all_ids( - rdc_handle_, group_id_list, &count); - if ( result != RDC_ST_OK) break; + rdc_status_t result = RDC_ST_OK; + rdc_field_group_info_t group_info; + uint32_t count = 0; + std::string json_group_ids = "\"field_groups\": ["; + switch (field_group_ops_) { + case FIELD_GROUP_HELP: + show_help(); + break; + case FIELD_GROUP_CREATE: { + if (group_name_ == "") { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Must specify the group name when create a field group"); + } + std::vector fields = split_string(field_ids_, ','); + rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; + for (uint32_t i = 0; i < fields.size(); i++) { + if (!IsNumber(fields[i])) { + if (!get_field_id_from_name(fields[i], &field_ids[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The field name " + fields[i] + " is not valid"); + } + } else { + field_ids[i] = static_cast(std::stoi(fields[i])); + } + } + rdc_field_grp_t group_id; + result = rdc_group_field_create(rdc_handle_, fields.size(), &field_ids[0], + group_name_.c_str(), &group_id); + if (result == RDC_ST_OK) { + if (is_json_output()) { + std::cout << "\"field_group_id\": \"" << group_id << "\", \"status\": 
\"ok\""; + } else { + std::cout << "Successfully created a field group" + << " with a group ID " << group_id << std::endl; + return; + } + } + break; + } + case FIELD_GROUP_DELETE: + if (!is_group_set_) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the group id to delete a group"); + } + result = rdc_group_field_destroy(rdc_handle_, group_id_); + if (result == RDC_ST_OK) { + if (is_json_output()) { + std::cout << "\"field_group_id\": \"" << group_id_ << "\", \"status\": \"ok\""; + } else { + std::cout << "Successfully deleted the field group " << group_id_ << std::endl; + } + return; + } + break; + case FIELD_GROUP_LIST: + rdc_field_grp_t group_id_list[RDC_MAX_NUM_FIELD_GROUPS]; + result = rdc_group_field_get_all_ids(rdc_handle_, group_id_list, &count); + if (result != RDC_ST_OK) break; + if (!is_json_output()) { + std::cout << count << " field group found.\n"; + std::cout << "GroupID\t" + << "GroupName\t" + << "FieldIds\n"; + } + + for (uint32_t i = 0; i < count; i++) { + result = rdc_group_field_get_info(rdc_handle_, group_id_list[i], &group_info); + if (result != RDC_ST_OK) { + throw RdcException(RDC_ST_BAD_PARAMETER, "Fail to get information for field group " + + std::to_string(group_id_list[i])); + } + + if (!is_json_output()) { + std::cout << group_id_list[i] << "\t" << group_info.group_name << "\t\t"; + } else { + json_group_ids += "{\"group_id\": \""; + json_group_ids += std::to_string(group_id_list[i]); + json_group_ids += "\", \"group_name\": \""; + json_group_ids += group_info.group_name; + json_group_ids += "\", \"field_ids\": ["; + } + + for (uint32_t j = 0; j < group_info.count; j++) { + if (!is_json_output()) { + std::cout << group_info.field_ids[j]; + } else { + json_group_ids += std::to_string(group_info.field_ids[j]); + } + if (j < group_info.count - 1) { if (!is_json_output()) { - std::cout << count << " field group found.\n"; - std::cout << "GroupID\t" << "GroupName\t" << "FieldIds\n"; + std::cout << ","; + } else 
{ + json_group_ids += ","; } + } + } + if (!is_json_output()) { + std::cout << std::endl; + } else { + json_group_ids += "]}"; + if (i != count - 1) { + json_group_ids += ","; + } + } + } + if (is_json_output()) { + json_group_ids += "], \"status\": \"ok\""; + std::cout << json_group_ids; + } + break; + case FIELD_GROUP_INFO: + if (!is_group_set_) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Need to specify the group id to show field group info"); + } + result = rdc_group_field_get_info(rdc_handle_, group_id_, &group_info); + if (result == RDC_ST_OK) { + if (is_json_output()) { + std::cout << "\"group_name\": \"" << group_info.group_name << "\", \"field_ids\": ["; + } else { + std::cout << "Group name: " << group_info.group_name << std::endl; + std::cout << "Field Ids: "; + } + for (uint32_t i = 0; i < group_info.count; i++) { + if (is_json_output()) { + std::cout << group_info.field_ids[i]; + if (i != group_info.count - 1) { + std::cout << ","; + } + } else { + std::cout << group_info.field_ids[i] << " "; + } + } + if (is_json_output()) { + std::cout << "], \"status\": \"ok\""; + } else { + std::cout << std::endl; + } + return; + } + break; + default: + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command"); + } - for (uint32_t i = 0; i < count; i++) { - result = rdc_group_field_get_info( - rdc_handle_, group_id_list[i], &group_info); - if (result != RDC_ST_OK) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "Fail to get information for field group " + - std::to_string(group_id_list[i])); - } - - if (!is_json_output()) { - std::cout << group_id_list[i] << "\t" - << group_info.group_name << "\t\t"; - } else { - json_group_ids += "{\"group_id\": \""; - json_group_ids += std::to_string(group_id_list[i]); - json_group_ids += "\", \"group_name\": \""; - json_group_ids += group_info.group_name; - json_group_ids += "\", \"field_ids\": ["; - } - - for (uint32_t j = 0; j < group_info.count; j++) { - if (!is_json_output()) { - std::cout << 
group_info.field_ids[j]; - } else { - json_group_ids += - std::to_string(group_info.field_ids[j]); - } - if ( j < group_info.count -1 ) { - if (!is_json_output()) { - std::cout << ","; - } else { - json_group_ids += ","; - } - } - } - if (!is_json_output()) { - std::cout << std::endl; - } else { - json_group_ids += "]}"; - if (i != count -1) { - json_group_ids += ","; - } - } - } - if (is_json_output()) { - json_group_ids += "], \"status\": \"ok\""; - std::cout << json_group_ids; - } - break; - case FIELD_GROUP_INFO: - if (!is_group_set_) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify the group id to show field group info"); - } - result = rdc_group_field_get_info( - rdc_handle_, group_id_, &group_info); - if (result == RDC_ST_OK) { - if (is_json_output()) { - std::cout << "\"group_name\": \"" << group_info.group_name - << "\", \"field_ids\": ["; - } else { - std::cout << "Group name: " << group_info.group_name - << std::endl; - std::cout << "Field Ids: "; - } - for (uint32_t i = 0; i < group_info.count; i++) { - if (is_json_output()) { - std::cout << group_info.field_ids[i]; - if ( i != group_info.count-1 ) { - std::cout << ","; - } - } else { - std::cout << group_info.field_ids[i] << " "; - } - } - if (is_json_output()) { - std::cout << "], \"status\": \"ok\""; - } else { - std::cout << std::endl; - } - return; - } - break; - default: - throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command"); - } - - if (result != RDC_ST_OK) { - throw RdcException(result, rdc_status_string(result)); - } + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); + } } - } // namespace rdc } // namespace amd - - diff --git a/projects/rdc/rdci/src/RdciGroupSubSystem.cc b/projects/rdc/rdci/src/RdciGroupSubSystem.cc index 301bbed067..f52259982f 100644 --- a/projects/rdc/rdci/src/RdciGroupSubSystem.cc +++ b/projects/rdc/rdci/src/RdciGroupSubSystem.cc @@ -20,339 +20,307 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
OR OTHER DEALINGS IN THE SOFTWARE. */ #include "RdciGroupSubSystem.h" + #include #include + #include "common/rdc_utils.h" -#include "rdc_lib/rdc_common.h" #include "rdc/rdc.h" #include "rdc_lib/RdcException.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdciGroupSubSystem::RdciGroupSubSystem(): - group_ops_(GROUP_UNKNOWN) - , is_group_set_(false) { -} +RdciGroupSubSystem::RdciGroupSubSystem() : group_ops_(GROUP_UNKNOWN), is_group_set_(false) {} -void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { - const int HOST_OPTIONS = 1000; - const int JSON_OPTIONS = 1001; - const struct option long_options[] = { - {"host", required_argument, nullptr, HOST_OPTIONS }, - {"help", optional_argument, nullptr, 'h' }, - {"unauth", optional_argument, nullptr, 'u' }, - {"list", optional_argument, nullptr, 'l' }, - {"group", required_argument, nullptr, 'g'}, - {"create", required_argument, nullptr, 'c' }, - {"add", required_argument, nullptr, 'a' }, - {"info", optional_argument, nullptr, 'i' }, - {"delete", required_argument, nullptr, 'd' }, - {"json", optional_argument, nullptr, JSON_OPTIONS }, - { nullptr, 0 , nullptr, 0 } - }; +void RdciGroupSubSystem::parse_cmd_opts(int argc, char** argv) { + const int HOST_OPTIONS = 1000; + const int JSON_OPTIONS = 1001; + const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS}, + {"help", optional_argument, nullptr, 'h'}, + {"unauth", optional_argument, nullptr, 'u'}, + {"list", optional_argument, nullptr, 'l'}, + {"group", required_argument, nullptr, 'g'}, + {"create", required_argument, nullptr, 'c'}, + {"add", required_argument, nullptr, 'a'}, + {"info", optional_argument, nullptr, 'i'}, + {"delete", required_argument, nullptr, 'd'}, + {"json", optional_argument, nullptr, JSON_OPTIONS}, + {nullptr, 0, nullptr, 0}}; - int option_index = 0; - int opt = 0; + int option_index = 0; + int opt = 0; - while ((opt = getopt_long(argc, argv, "hluic:g:a:d:", - long_options, 
&option_index)) != -1) { - switch (opt) { - case HOST_OPTIONS: - ip_port_ = optarg; - break; - case JSON_OPTIONS: - set_json_output(true); - break; - case 'h': - group_ops_ = GROUP_HELP; - return; - case 'u': - use_auth_ = false; - break; - case 'l': - group_ops_ = GROUP_LIST; - break; - case 'g': - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The group id needs to be a number"); - } - group_id_ = std::stoi(optarg); - is_group_set_ = true; - break; - case 'c': - group_ops_ = GROUP_CREATE; - group_name_ = optarg; - break; - case 'a': - // Create may add GPUs as well. - if (group_ops_ != GROUP_CREATE) { - group_ops_ = GROUP_ADD_GPUS; - } - gpu_ids_ = optarg; - break; - case 'i': - group_ops_ = GROUP_INFO; - break; - case 'd': - group_ops_ = GROUP_DELETE; - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The group id needs to be a number"); - } - group_id_ = std::stoi(optarg); - is_group_set_ = true; - break; - default: - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Unknown command line options"); + while ((opt = getopt_long(argc, argv, "hluic:g:a:d:", long_options, &option_index)) != -1) { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case JSON_OPTIONS: + set_json_output(true); + break; + case 'h': + group_ops_ = GROUP_HELP; + return; + case 'u': + use_auth_ = false; + break; + case 'l': + group_ops_ = GROUP_LIST; + break; + case 'g': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number"); } - } - - if (group_ops_ == GROUP_UNKNOWN) { + group_id_ = std::stoi(optarg); + is_group_set_ = true; + break; + case 'c': + group_ops_ = GROUP_CREATE; + group_name_ = optarg; + break; + case 'a': + // Create may add GPUs as well. 
+ if (group_ops_ != GROUP_CREATE) { + group_ops_ = GROUP_ADD_GPUS; + } + gpu_ids_ = optarg; + break; + case 'i': + group_ops_ = GROUP_INFO; + break; + case 'd': + group_ops_ = GROUP_DELETE; + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number"); + } + group_id_ = std::stoi(optarg); + is_group_set_ = true; + break; + default: show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Must specify a valid operations"); + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options"); } + } + + if (group_ops_ == GROUP_UNKNOWN) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Must specify a valid operations"); + } } void RdciGroupSubSystem::show_help() const { - if (is_json_output()) return; - std::cout << " group -- Used to create and maintain groups of GPUs.\n\n"; - std::cout << "Usage\n"; - std::cout << " rdci group [--host :port] [--json] [-u] -l\n"; - std::cout << " rdci group [--host :port] [--json] [-u]" - << " -c [-a ]\n"; - std::cout << " rdci group [--host :port] [--json] [-u]" - << " -g [-a ]\n"; - std::cout << " rdci group [--host :port] [--json] [-u] " - << "-g [-i]\n"; - std::cout << " rdci group [--host :port] [--json] [-u] " - << "-d \n"; - std::cout << "\nFlags:\n"; - show_common_usage(); - std::cout << " --json " - << "Output using json.\n"; - std::cout << " -l --list " - << "List the groups that currently exist for a host.\n"; - std::cout << " -g --group groupId " - << "The GPU group to query on the specified host.\n"; - std::cout << " -c --create groupName " - << "Create a group on the remote host.\n"; - std::cout << " -a --add gpuIndexes " - << "Comma-separated list of the GPU indexes to add to the group.\n"; - std::cout << " -i --info " - << "Display the information for the specified group Id\n"; - std::cout << " -d --delete groupId " - << "Delete a group on the remote host.\n"; + if (is_json_output()) return; + std::cout << " group -- Used to create and 
maintain groups of GPUs.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci group [--host :port] [--json] [-u] -l\n"; + std::cout << " rdci group [--host :port] [--json] [-u]" + << " -c [-a ]\n"; + std::cout << " rdci group [--host :port] [--json] [-u]" + << " -g [-a ]\n"; + std::cout << " rdci group [--host :port] [--json] [-u] " + << "-g [-i]\n"; + std::cout << " rdci group [--host :port] [--json] [-u] " + << "-d \n"; + std::cout << "\nFlags:\n"; + show_common_usage(); + std::cout << " --json " + << "Output using json.\n"; + std::cout << " -l --list " + << "List the groups that currently exist for a host.\n"; + std::cout << " -g --group groupId " + << "The GPU group to query on the specified host.\n"; + std::cout << " -c --create groupName " + << "Create a group on the remote host.\n"; + std::cout << " -a --add gpuIndexes " + << "Comma-separated list of the GPU indexes to add to the group.\n"; + std::cout << " -i --info " + << "Display the information for the specified group Id\n"; + std::cout << " -d --delete groupId " + << "Delete a group on the remote host.\n"; } - void RdciGroupSubSystem::process() { - rdc_status_t result = RDC_ST_OK; - std::vector gpu_ids; - rdc_group_info_t group_info; - uint32_t count = 0; - std::string json_group_ids = "\"gpu_groups\": ["; - switch (group_ops_) { - case GROUP_HELP: - show_help(); - break; - case GROUP_CREATE: - if (group_name_ == "") { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Must specify the group name when create a group"); - } - rdc_gpu_group_t group_id; - result = rdc_group_gpu_create(rdc_handle_, RDC_GROUP_EMPTY, - group_name_.c_str(), &group_id); - if (result != RDC_ST_OK) { - throw RdcException(result, "Fail to create group " - + group_name_); - } + rdc_status_t result = RDC_ST_OK; + std::vector gpu_ids; + rdc_group_info_t group_info; + uint32_t count = 0; + std::string json_group_ids = "\"gpu_groups\": ["; + switch (group_ops_) { + case GROUP_HELP: + show_help(); + break; + case 
GROUP_CREATE: + if (group_name_ == "") { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Must specify the group name when create a group"); + } + rdc_gpu_group_t group_id; + result = rdc_group_gpu_create(rdc_handle_, RDC_GROUP_EMPTY, group_name_.c_str(), &group_id); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to create group " + group_name_); + } - gpu_ids = split_string(gpu_ids_, ','); - for (uint32_t i = 0; i < gpu_ids.size(); i++) { - if (!IsNumber(gpu_ids[i])) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "The GPU Id "+gpu_ids[i]+" needs to be a number"); - } - result = rdc_group_gpu_add(rdc_handle_, - group_id, std::stoi(gpu_ids[i])); - if (result != RDC_ST_OK) { - throw RdcException(result, "Fail to add GPU " - + gpu_ids[i] + " to the group"); - } - } + gpu_ids = split_string(gpu_ids_, ','); + for (uint32_t i = 0; i < gpu_ids.size(); i++) { + if (!IsNumber(gpu_ids[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The GPU Id " + gpu_ids[i] + " needs to be a number"); + } + result = rdc_group_gpu_add(rdc_handle_, group_id, std::stoi(gpu_ids[i])); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to add GPU " + gpu_ids[i] + " to the group"); + } + } - if (result == RDC_ST_OK) { - if (is_json_output()) { - std::cout << "\"group_id\": \"" << group_id - <<"\", \"status\": \"ok\""; - } else { - std::cout << "Successfully created group with a group ID " - << group_id << std::endl; - } - return; - } - break; - case GROUP_DELETE: - if (!is_group_set_) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify the group id to delete a group"); - } - result = rdc_group_gpu_destroy(rdc_handle_, group_id_); - if (result == RDC_ST_OK) { - if (is_json_output()) { - std::cout << "\"group_id\": \"" << group_id_ - <<"\", \"status\": \"ok\""; - } else { - std::cout << "Successfully deleted the group " - << group_id_ << std::endl; - } - return; - } - break; - case GROUP_LIST: - rdc_gpu_group_t 
group_id_list[RDC_MAX_NUM_GROUPS]; - result = rdc_group_get_all_ids(rdc_handle_, group_id_list, &count); - if ( result != RDC_ST_OK) break; + if (result == RDC_ST_OK) { + if (is_json_output()) { + std::cout << "\"group_id\": \"" << group_id << "\", \"status\": \"ok\""; + } else { + std::cout << "Successfully created group with a group ID " << group_id << std::endl; + } + return; + } + break; + case GROUP_DELETE: + if (!is_group_set_) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the group id to delete a group"); + } + result = rdc_group_gpu_destroy(rdc_handle_, group_id_); + if (result == RDC_ST_OK) { + if (is_json_output()) { + std::cout << "\"group_id\": \"" << group_id_ << "\", \"status\": \"ok\""; + } else { + std::cout << "Successfully deleted the group " << group_id_ << std::endl; + } + return; + } + break; + case GROUP_LIST: + rdc_gpu_group_t group_id_list[RDC_MAX_NUM_GROUPS]; + result = rdc_group_get_all_ids(rdc_handle_, group_id_list, &count); + if (result != RDC_ST_OK) break; - if (!is_json_output()) { - std::cout << count << " group found.\n"; - std::cout << "GroupID\t" << "GroupName\t" << "GPUIndex\n"; - } - for (uint32_t i = 0; i < count; i++) { - result = rdc_group_gpu_get_info(rdc_handle_, - group_id_list[i], &group_info); - if (result != RDC_ST_OK) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "Fail to get information for group " - + std::to_string(group_id_list[i])); - } + if (!is_json_output()) { + std::cout << count << " group found.\n"; + std::cout << "GroupID\t" + << "GroupName\t" + << "GPUIndex\n"; + } + for (uint32_t i = 0; i < count; i++) { + result = rdc_group_gpu_get_info(rdc_handle_, group_id_list[i], &group_info); + if (result != RDC_ST_OK) { + throw RdcException(RDC_ST_BAD_PARAMETER, "Fail to get information for group " + + std::to_string(group_id_list[i])); + } - if (!is_json_output()) { - std::cout << group_id_list[i] << "\t" - << group_info.group_name << "\t\t"; - } else { - json_group_ids += 
"{\"group_id\": \""; - json_group_ids += std::to_string(group_id_list[i]); - json_group_ids += "\", \"group_name\": \""; - json_group_ids += group_info.group_name; - json_group_ids += "\", \"gpu_indexes\": ["; - } - for (uint32_t j = 0; j < group_info.count; j++) { - if (!is_json_output()) { - std::cout << group_info.entity_ids[j]; - } else { - json_group_ids += - std::to_string(group_info.entity_ids[j]); - } - if (j < group_info.count -1) { - if (!is_json_output()) { - std::cout << ","; - } else { - json_group_ids += ","; - } - } - } - if (!is_json_output()) { - std::cout << std::endl; - } else { - json_group_ids += "]}"; - if (i != count -1) { - json_group_ids += ","; - } - } - } - if (is_json_output()) { - json_group_ids += "], \"status\": \"ok\""; - std::cout << json_group_ids; - } - break; - case GROUP_ADD_GPUS: - if (!is_group_set_) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify the group id to add a group"); + if (!is_json_output()) { + std::cout << group_id_list[i] << "\t" << group_info.group_name << "\t\t"; + } else { + json_group_ids += "{\"group_id\": \""; + json_group_ids += std::to_string(group_id_list[i]); + json_group_ids += "\", \"group_name\": \""; + json_group_ids += group_info.group_name; + json_group_ids += "\", \"gpu_indexes\": ["; + } + for (uint32_t j = 0; j < group_info.count; j++) { + if (!is_json_output()) { + std::cout << group_info.entity_ids[j]; + } else { + json_group_ids += std::to_string(group_info.entity_ids[j]); + } + if (j < group_info.count - 1) { + if (!is_json_output()) { + std::cout << ","; + } else { + json_group_ids += ","; } + } + } + if (!is_json_output()) { + std::cout << std::endl; + } else { + json_group_ids += "]}"; + if (i != count - 1) { + json_group_ids += ","; + } + } + } + if (is_json_output()) { + json_group_ids += "], \"status\": \"ok\""; + std::cout << json_group_ids; + } + break; + case GROUP_ADD_GPUS: + if (!is_group_set_) { + show_help(); + throw 
RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the group id to add a group"); + } - gpu_ids = split_string(gpu_ids_, ','); - for (uint32_t i = 0; i < gpu_ids.size(); i++) { - if (!IsNumber(gpu_ids[i])) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "The GPU Id "+gpu_ids[i]+" needs to be a number"); - } - result = rdc_group_gpu_add(rdc_handle_, - group_id_, std::stoi(gpu_ids[i])); - if (result != RDC_ST_OK) { - throw RdcException(result, "Fail to add GPU " - + gpu_ids[i] + " to the group"); - } + gpu_ids = split_string(gpu_ids_, ','); + for (uint32_t i = 0; i < gpu_ids.size(); i++) { + if (!IsNumber(gpu_ids[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The GPU Id " + gpu_ids[i] + " needs to be a number"); + } + result = rdc_group_gpu_add(rdc_handle_, group_id_, std::stoi(gpu_ids[i])); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to add GPU " + gpu_ids[i] + " to the group"); + } + } + if (result == RDC_ST_OK) { + if (is_json_output()) { + std::cout << "\"group_id\": \"" << group_id_ << "\", \"status\": \"ok\""; + } else { + std::cout << "Successfully added the GPU " << gpu_ids_ << " to group " << group_id_ + << std::endl; + } + return; + } + break; + case GROUP_INFO: + if (!is_group_set_) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the group id to show group info"); + } + result = rdc_group_gpu_get_info(rdc_handle_, group_id_, &group_info); + if (result == RDC_ST_OK) { + if (is_json_output()) { + std::cout << "\"group_name\": \"" << group_info.group_name << "\", \"gpu_indexes\": ["; + } else { + std::cout << "Group name: " << group_info.group_name << std::endl; + std::cout << "Gpu indexes: "; + } + for (uint32_t i = 0; i < group_info.count; i++) { + if (is_json_output()) { + std::cout << group_info.entity_ids[i]; + if (i != group_info.count - 1) { + std::cout << ","; } - if (result == RDC_ST_OK) { - if (is_json_output()) { - std::cout << "\"group_id\": \"" << group_id_ - <<"\", \"status\": \"ok\""; - } 
else { - std::cout << "Successfully added the GPU " << gpu_ids_ - << " to group "<< group_id_ << std::endl; - } - return; - } - break; - case GROUP_INFO: - if (!is_group_set_) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify the group id to show group info"); - } - result = rdc_group_gpu_get_info(rdc_handle_, - group_id_, &group_info); - if (result == RDC_ST_OK) { - if (is_json_output()) { - std::cout << "\"group_name\": \"" << group_info.group_name - << "\", \"gpu_indexes\": ["; - } else { - std::cout << "Group name: " - << group_info.group_name << std::endl; - std::cout << "Gpu indexes: "; - } - for (uint32_t i = 0; i < group_info.count; i++) { - if (is_json_output()) { - std::cout << group_info.entity_ids[i]; - if ( i != group_info.count-1 ) { - std::cout << ","; - } - } else { - std::cout << group_info.entity_ids[i] << " "; - } - } - if (is_json_output()) { - std::cout << "], \"status\": \"ok\""; - } else { - std::cout << std::endl; - } - return; - } - break; - default: - throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command"); - } + } else { + std::cout << group_info.entity_ids[i] << " "; + } + } + if (is_json_output()) { + std::cout << "], \"status\": \"ok\""; + } else { + std::cout << std::endl; + } + return; + } + break; + default: + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command"); + } - if (result != RDC_ST_OK) { - throw RdcException(result, rdc_status_string(result)); - } + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); + } } - } // namespace rdc } // namespace amd - - diff --git a/projects/rdc/rdci/src/RdciStatsSubSystem.cc b/projects/rdc/rdci/src/RdciStatsSubSystem.cc index e47904abe8..95ac1783c5 100644 --- a/projects/rdc/rdci/src/RdciStatsSubSystem.cc +++ b/projects/rdc/rdci/src/RdciStatsSubSystem.cc @@ -20,408 +20,347 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "RdciStatsSubSystem.h" + #include -#include #include +#include + #include -#include #include -#include "rdc_lib/rdc_common.h" +#include + #include "common/rdc_utils.h" #include "rdc/rdc.h" #include "rdc_lib/RdcException.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdciStatsSubSystem::RdciStatsSubSystem() { -} +RdciStatsSubSystem::RdciStatsSubSystem() {} -RdciStatsSubSystem::~RdciStatsSubSystem() { -} +RdciStatsSubSystem::~RdciStatsSubSystem() {} +void RdciStatsSubSystem::parse_cmd_opts(int argc, char** argv) { + const int HOST_OPTIONS = 1000; + const int JSON_OPTIONS = 1001; + const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS}, + {"help", optional_argument, nullptr, 'h'}, + {"unauth", optional_argument, nullptr, 'u'}, + {"jstart", required_argument, nullptr, 's'}, + {"jstop", required_argument, nullptr, 'x'}, + {"job", required_argument, nullptr, 'j'}, + {"jremove", required_argument, nullptr, 'r'}, + {"jremoveall", optional_argument, nullptr, 'a'}, + {"verbose", optional_argument, nullptr, 'v'}, + {"group", required_argument, nullptr, 'g'}, + {"json", optional_argument, nullptr, JSON_OPTIONS}, + {nullptr, 0, nullptr, 0}}; -void RdciStatsSubSystem::parse_cmd_opts(int argc, char ** argv) { - const int HOST_OPTIONS = 1000; - const int JSON_OPTIONS = 1001; - const struct option long_options[] = { - {"host", required_argument, nullptr, HOST_OPTIONS }, - {"help", optional_argument, nullptr, 'h' }, - {"unauth", optional_argument, nullptr, 'u' }, - {"jstart", required_argument, nullptr, 's' }, - {"jstop", required_argument, nullptr, 'x' }, - {"job", required_argument, nullptr, 'j' }, - {"jremove", required_argument, nullptr, 'r'}, - {"jremoveall", optional_argument, nullptr, 'a' }, - {"verbose", optional_argument, nullptr, 'v'}, - {"group", required_argument, nullptr, 'g'}, - {"json", optional_argument, nullptr, JSON_OPTIONS}, - { nullptr, 0 , nullptr, 0 } - }; + bool is_group_id_set = false; + int 
option_index = 0; + int opt = 0; - bool is_group_id_set = false; - int option_index = 0; - int opt = 0; - - while ((opt = getopt_long(argc, argv, "huvas:x:j:r:g:", - long_options, &option_index)) != -1) { - switch (opt) { - case HOST_OPTIONS: - ip_port_ = optarg; - break; - case JSON_OPTIONS: - set_json_output(true); - break; - case 'h': - stats_ops_ = STATS_HELP; - return; - case 'u': - use_auth_ = false; - break; - case 's': - stats_ops_ = STATS_START_RECORDING; - job_id_ = optarg; - break; - case 'g': - if (!IsNumber(optarg)) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "The group id needs to be a number"); - } - group_id_ = std::stoi(optarg); - is_group_id_set = true; - break; - case 'x': - stats_ops_ = STATS_STOP_RECORDING; - job_id_ = optarg; - break; - case 'j': - stats_ops_ = STATS_DISPLAY; - job_id_ = optarg; - break; - case 'v': - is_verbose_ = true; - break; - case 'r': - stats_ops_ = STATS_REMOVE; - job_id_ = optarg; - break; - case 'a': - stats_ops_ = STATS_REMOVE_ALL; - break; - default: - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Unknown command line options"); + while ((opt = getopt_long(argc, argv, "huvas:x:j:r:g:", long_options, &option_index)) != -1) { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case JSON_OPTIONS: + set_json_output(true); + break; + case 'h': + stats_ops_ = STATS_HELP; + return; + case 'u': + use_auth_ = false; + break; + case 's': + stats_ops_ = STATS_START_RECORDING; + job_id_ = optarg; + break; + case 'g': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number"); } + group_id_ = std::stoi(optarg); + is_group_id_set = true; + break; + case 'x': + stats_ops_ = STATS_STOP_RECORDING; + job_id_ = optarg; + break; + case 'j': + stats_ops_ = STATS_DISPLAY; + job_id_ = optarg; + break; + case 'v': + is_verbose_ = true; + break; + case 'r': + stats_ops_ = STATS_REMOVE; + job_id_ = optarg; + break; + case 'a': + 
stats_ops_ = STATS_REMOVE_ALL; + break; + default: + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options"); } + } - if (stats_ops_ == STATS_START_RECORDING - && is_group_id_set == false) { - show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, - "Need to specify the group id to start recording"); - } + if (stats_ops_ == STATS_START_RECORDING && is_group_id_set == false) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the group id to start recording"); + } } void RdciStatsSubSystem::show_help() const { - if (is_json_output()) return; - std::cout << " stats -- Used to view job statistics.\n\n"; - std::cout << "Usage\n"; - std::cout << " rdci stats [--host :port] [-u] [--json] " + if (is_json_output()) return; + std::cout << " stats -- Used to view job statistics.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci stats [--host :port] [-u] [--json] " << "-s -g \n"; - std::cout << " rdci stats [--host :port] [-u] [--json] " + std::cout << " rdci stats [--host :port] [-u] [--json] " << "-x \n"; - std::cout << " rdci stats [--host :port] [-u] [--json] [-v] " - << "-j \n"; - std::cout << " rdci stats [--host :port] [-u] [--json] " - << "-r \n"; - std::cout << " rdci stats [--host :port] [-u] [--json] -a\n"; - std::cout << "\nFlags:\n"; - show_common_usage(); - std::cout << " --json " - << "Output using json.\n"; - std::cout << " -s --jstart Start recording " - << "job statistics.\n"; - std::cout << " -g --group-id The GPU group to query " - << "on the specified host.\n"; - std::cout << " -x --jstop Stop recording " - << "job statistics.\n"; - std::cout << " -j --job Display " - << "job statistics.\n"; - std::cout << " -v --verbose Show job information " - << "for each GPU.\n"; - std::cout << " -r --jremove Remove " - << "job statistics.\n"; - std::cout << " -a --jremoveall Remove " - << "all job statistics.\n"; + std::cout << " rdci stats [--host :port] [-u] [--json] [-v] " + << "-j \n"; + std::cout << " 
rdci stats [--host :port] [-u] [--json] " + << "-r \n"; + std::cout << " rdci stats [--host :port] [-u] [--json] -a\n"; + std::cout << "\nFlags:\n"; + show_common_usage(); + std::cout << " --json " + << "Output using json.\n"; + std::cout << " -s --jstart Start recording " + << "job statistics.\n"; + std::cout << " -g --group-id The GPU group to query " + << "on the specified host.\n"; + std::cout << " -x --jstop Stop recording " + << "job statistics.\n"; + std::cout << " -j --job Display " + << "job statistics.\n"; + std::cout << " -v --verbose Show job information " + << "for each GPU.\n"; + std::cout << " -r --jremove Remove " + << "job statistics.\n"; + std::cout << " -a --jremoveall Remove " + << "all job statistics.\n"; } -void RdciStatsSubSystem::show_job_stats_json( - const rdc_gpu_usage_info_t& gpu_info) const { - std::cout << "\"start_time\": " << gpu_info.start_time << ","; - std::cout << "\"end_time\": " << gpu_info.end_time << ","; - std::cout << "\"execution_time\": " << - (gpu_info.end_time-gpu_info.start_time) << ","; - std::cout << "\"energy_consumed\": " << gpu_info.energy_consumed << ","; +void RdciStatsSubSystem::show_job_stats_json(const rdc_gpu_usage_info_t& gpu_info) const { + std::cout << "\"start_time\": " << gpu_info.start_time << ","; + std::cout << "\"end_time\": " << gpu_info.end_time << ","; + std::cout << "\"execution_time\": " << (gpu_info.end_time - gpu_info.start_time) << ","; + std::cout << "\"energy_consumed\": " << gpu_info.energy_consumed << ","; - std::cout << "\"power_usage_max\": " - << gpu_info.power_usage.max_value << ","; - std::cout << "\"power_usage_min\": " - << gpu_info.power_usage.min_value << ","; - std::cout << "\"power_usage_avg\": " - << gpu_info.power_usage.average << ","; - std::cout << "\"power_usage_stanard_deviation\": " - << gpu_info.power_usage.standard_deviation << ","; + std::cout << "\"power_usage_max\": " << gpu_info.power_usage.max_value << ","; + std::cout << "\"power_usage_min\": " << 
gpu_info.power_usage.min_value << ","; + std::cout << "\"power_usage_avg\": " << gpu_info.power_usage.average << ","; + std::cout << "\"power_usage_stanard_deviation\": " << gpu_info.power_usage.standard_deviation + << ","; - std::cout << "\"gpu_clock_max\": " - << gpu_info.gpu_clock.max_value << ","; - std::cout << "\"gpu_clock_min\": " - << gpu_info.gpu_clock.min_value << ","; - std::cout << "\"gpu_clock_avg\": " - << gpu_info.gpu_clock.average << ","; - std::cout << "\"gpu_clock_stanard_deviation\": " - << gpu_info.gpu_clock.standard_deviation << ","; + std::cout << "\"gpu_clock_max\": " << gpu_info.gpu_clock.max_value << ","; + std::cout << "\"gpu_clock_min\": " << gpu_info.gpu_clock.min_value << ","; + std::cout << "\"gpu_clock_avg\": " << gpu_info.gpu_clock.average << ","; + std::cout << "\"gpu_clock_stanard_deviation\": " << gpu_info.gpu_clock.standard_deviation << ","; - std::cout << "\"memory_clock_max\": " - << gpu_info.memory_clock.max_value << ","; - std::cout << "\"memory_clock_min\": " - << gpu_info.memory_clock.min_value << ","; - std::cout << "\"memory_clock_avg\": " - << gpu_info.memory_clock.average << ","; - std::cout << "\"memory_clock_stanard_deviation\": " - << gpu_info.memory_clock.standard_deviation << ","; + std::cout << "\"memory_clock_max\": " << gpu_info.memory_clock.max_value << ","; + std::cout << "\"memory_clock_min\": " << gpu_info.memory_clock.min_value << ","; + std::cout << "\"memory_clock_avg\": " << gpu_info.memory_clock.average << ","; + std::cout << "\"memory_clock_stanard_deviation\": " << gpu_info.memory_clock.standard_deviation + << ","; - std::cout << "\"gpu_utilization_max\": " - << gpu_info.gpu_utilization.max_value << ","; - std::cout << "\"gpu_utilization_min\": " - << gpu_info.gpu_utilization.min_value << ","; - std::cout << "\"gpu_utilization_avg\": " - << gpu_info.gpu_utilization.average << ","; - std::cout << "\"gpu_utilization_deviation\": " - << gpu_info.gpu_utilization.standard_deviation << ","; + std::cout << 
"\"gpu_utilization_max\": " << gpu_info.gpu_utilization.max_value << ","; + std::cout << "\"gpu_utilization_min\": " << gpu_info.gpu_utilization.min_value << ","; + std::cout << "\"gpu_utilization_avg\": " << gpu_info.gpu_utilization.average << ","; + std::cout << "\"gpu_utilization_deviation\": " << gpu_info.gpu_utilization.standard_deviation + << ","; - std::cout << "\"max_gpu_memory_used\": " - << gpu_info.max_gpu_memory_used << ","; + std::cout << "\"max_gpu_memory_used\": " << gpu_info.max_gpu_memory_used << ","; - std::cout << "\"memory_utilization_max\": " - << gpu_info.memory_utilization.max_value << ","; - std::cout << "\"memory_utilization_min\": " - << gpu_info.memory_utilization.min_value << ","; - std::cout << "\"memory_utilization_avg\": " - << gpu_info.memory_utilization.average << ","; - std::cout << "\"memory_utilization_stanard_deviation\": " + std::cout << "\"memory_utilization_max\": " << gpu_info.memory_utilization.max_value << ","; + std::cout << "\"memory_utilization_min\": " << gpu_info.memory_utilization.min_value << ","; + std::cout << "\"memory_utilization_avg\": " << gpu_info.memory_utilization.average << ","; + std::cout << "\"memory_utilization_stanard_deviation\": " << gpu_info.memory_utilization.standard_deviation << ","; - std::cout << "\"gpu_temperature_max\": " - << gpu_info.gpu_temperature.max_value << ","; - std::cout << "\"gpu_temperature_min\": " - << gpu_info.gpu_temperature.min_value << ","; - std::cout << "\"gpu_temperature_avg\": " - << gpu_info.gpu_temperature.average << ","; - std::cout << "\"gpu_temperature_stanard_deviation\": " + std::cout << "\"gpu_temperature_max\": " << gpu_info.gpu_temperature.max_value << ","; + std::cout << "\"gpu_temperature_min\": " << gpu_info.gpu_temperature.min_value << ","; + std::cout << "\"gpu_temperature_avg\": " << gpu_info.gpu_temperature.average << ","; + std::cout << "\"gpu_temperature_stanard_deviation\": " << gpu_info.gpu_temperature.standard_deviation << ","; - std::cout << 
"\"pcie_rx_max\": " - << gpu_info.pcie_rx.max_value << ","; - std::cout << "\"pcie_rx_min\": " - << gpu_info.pcie_rx.min_value << ","; - std::cout << "\"pcie_rx_avg\": " - << gpu_info.pcie_rx.average << ","; - std::cout << "\"pcie_rx_stanard_deviation\": " - << gpu_info.pcie_rx.standard_deviation << ","; + std::cout << "\"pcie_rx_max\": " << gpu_info.pcie_rx.max_value << ","; + std::cout << "\"pcie_rx_min\": " << gpu_info.pcie_rx.min_value << ","; + std::cout << "\"pcie_rx_avg\": " << gpu_info.pcie_rx.average << ","; + std::cout << "\"pcie_rx_stanard_deviation\": " << gpu_info.pcie_rx.standard_deviation << ","; - std::cout << "\"pcie_tx_max\": " - << gpu_info.pcie_tx.max_value << ","; - std::cout << "\"pcie_tx_min\": " - << gpu_info.pcie_tx.min_value << ","; - std::cout << "\"pcie_tx_avg\": " - << gpu_info.pcie_tx.average << ","; - std::cout << "\"pcie_tx_stanard_deviation\": " - << gpu_info.pcie_tx.standard_deviation << ","; + std::cout << "\"pcie_tx_max\": " << gpu_info.pcie_tx.max_value << ","; + std::cout << "\"pcie_tx_min\": " << gpu_info.pcie_tx.min_value << ","; + std::cout << "\"pcie_tx_avg\": " << gpu_info.pcie_tx.average << ","; + std::cout << "\"pcie_tx_stanard_deviation\": " << gpu_info.pcie_tx.standard_deviation << ","; - std::cout << "\"ecc_correct\": " << gpu_info.ecc_correct << ","; - std::cout << "\"ecc_uncorrect\": " << gpu_info.ecc_uncorrect; + std::cout << "\"ecc_correct\": " << gpu_info.ecc_correct << ","; + std::cout << "\"ecc_uncorrect\": " << gpu_info.ecc_uncorrect; } -void RdciStatsSubSystem::show_job_stats( - const rdc_gpu_usage_info_t& gpu_info) const { - std::cout << "|------- Execution Stats ----------" - << "+------------------------------------\n"; - std::cout << "| Start Time | " - << std::put_time(std::gmtime(reinterpret_cast - (&gpu_info.start_time)), "%c %Z") << "\n"; - std::cout << "| End Time | " - << std::put_time(std::gmtime(reinterpret_cast - (&gpu_info.end_time)), "%c %Z") << "\n"; - std::cout << "| Total Execution Time 
(sec) | " - << (gpu_info.end_time-gpu_info.start_time) << "\n"; - std::cout << "+------- Performance Stats --------" - << "+------------------------------------\n"; - std::cout << "| Energy Consumed (Joules) | " - << gpu_info.energy_consumed << "\n"; - std::cout << "| Power Usage (Watts) | " << "Max: " - << gpu_info.power_usage.max_value<< " Min: "<< - gpu_info.power_usage.min_value << " Avg: " - << gpu_info.power_usage.average << " SD: " - << std::fixed << std::setprecision(2) - << gpu_info.power_usage.standard_deviation << "\n"; - std::cout << "| GPU Clock (MHz) | " << "Max: " - << gpu_info.gpu_clock.max_value << " Min: " << - gpu_info.gpu_clock.min_value << " Avg: " - << gpu_info.gpu_clock.average << " SD: " - << std::fixed << std::setprecision(2) - << gpu_info.gpu_clock.standard_deviation << "\n"; - std::cout << "| Memory Clock (MHz) | " << "Max: " - << gpu_info.memory_clock.max_value << " Min: " << - gpu_info.memory_clock.min_value << " Avg: " - << gpu_info.memory_clock.average << " SD: " - << std::fixed << std::setprecision(2) - << gpu_info.memory_clock.standard_deviation << "\n"; - std::cout << "| GPU Utilization (%) | " << "Max: " - << gpu_info.gpu_utilization.max_value <<" Min: " << - gpu_info.gpu_utilization.min_value << " Avg: " << - gpu_info.gpu_utilization.average << " SD: " - << std::fixed << std::setprecision(2) - << gpu_info.gpu_utilization.standard_deviation << "\n"; - std::cout << "| Max GPU Memory Used (bytes) | " << - gpu_info.max_gpu_memory_used << "\n"; - std::cout << "| Memory Utilization (%) | " - << "Max: " << gpu_info.memory_utilization.max_value - <<" Min: "<< gpu_info.memory_utilization.min_value - << " Avg: " << gpu_info.memory_utilization.average << " SD: " - << std::fixed << std::setprecision(2) - << gpu_info.memory_utilization.standard_deviation << "\n"; - std::cout << "| GPU Temperature (Celsius) | " - << "Max: " << gpu_info.gpu_temperature.max_value - <<" Min: "<< gpu_info.gpu_temperature.min_value - << " Avg: " << 
gpu_info.gpu_temperature.average << " SD: " - << std::fixed << std::setprecision(2) - << gpu_info.gpu_temperature.standard_deviation << "\n"; - std::cout << "| PCIe Rx Bandwidth (megabytes) | " - << "Max: " << gpu_info.pcie_rx.max_value - <<" Min: "<< gpu_info.pcie_rx.min_value - << " Avg: " << gpu_info.pcie_rx.average << " SD: " - << std::fixed << std::setprecision(2) - << gpu_info.pcie_rx.standard_deviation << "\n"; - std::cout << "| PCIe Tx Bandwidth (megabytes) | " - << "Max: " << gpu_info.pcie_tx.max_value - <<" Min: "<< gpu_info.pcie_tx.min_value - << " Avg: " << gpu_info.pcie_tx.average << " SD: " - << std::fixed << std::setprecision(2) - << gpu_info.pcie_tx.standard_deviation << "\n"; - std::cout << "| Correctable ECC Errors | " - << gpu_info.ecc_correct << "\n"; - std::cout << "| Uncorrectable ECC Errors | " - << gpu_info.ecc_uncorrect << "\n"; - std::cout << "+----------------------------------" - << "+------------------------------------\n"; +void RdciStatsSubSystem::show_job_stats(const rdc_gpu_usage_info_t& gpu_info) const { + std::cout << "|------- Execution Stats ----------" + << "+------------------------------------\n"; + std::cout << "| Start Time | " + << std::put_time(std::gmtime(reinterpret_cast(&gpu_info.start_time)), + "%c %Z") + << "\n"; + std::cout << "| End Time | " + << std::put_time(std::gmtime(reinterpret_cast(&gpu_info.end_time)), + "%c %Z") + << "\n"; + std::cout << "| Total Execution Time (sec) | " << (gpu_info.end_time - gpu_info.start_time) + << "\n"; + std::cout << "+------- Performance Stats --------" + << "+------------------------------------\n"; + std::cout << "| Energy Consumed (Joules) | " << gpu_info.energy_consumed << "\n"; + std::cout << "| Power Usage (Watts) | " + << "Max: " << gpu_info.power_usage.max_value + << " Min: " << gpu_info.power_usage.min_value + << " Avg: " << gpu_info.power_usage.average << " SD: " << std::fixed + << std::setprecision(2) << gpu_info.power_usage.standard_deviation << "\n"; + std::cout << "| 
GPU Clock (MHz) | " + << "Max: " << gpu_info.gpu_clock.max_value << " Min: " << gpu_info.gpu_clock.min_value + << " Avg: " << gpu_info.gpu_clock.average << " SD: " << std::fixed + << std::setprecision(2) << gpu_info.gpu_clock.standard_deviation << "\n"; + std::cout << "| Memory Clock (MHz) | " + << "Max: " << gpu_info.memory_clock.max_value + << " Min: " << gpu_info.memory_clock.min_value + << " Avg: " << gpu_info.memory_clock.average << " SD: " << std::fixed + << std::setprecision(2) << gpu_info.memory_clock.standard_deviation << "\n"; + std::cout << "| GPU Utilization (%) | " + << "Max: " << gpu_info.gpu_utilization.max_value + << " Min: " << gpu_info.gpu_utilization.min_value + << " Avg: " << gpu_info.gpu_utilization.average << " SD: " << std::fixed + << std::setprecision(2) << gpu_info.gpu_utilization.standard_deviation << "\n"; + std::cout << "| Max GPU Memory Used (bytes) | " << gpu_info.max_gpu_memory_used << "\n"; + std::cout << "| Memory Utilization (%) | " + << "Max: " << gpu_info.memory_utilization.max_value + << " Min: " << gpu_info.memory_utilization.min_value + << " Avg: " << gpu_info.memory_utilization.average << " SD: " << std::fixed + << std::setprecision(2) << gpu_info.memory_utilization.standard_deviation << "\n"; + std::cout << "| GPU Temperature (Celsius) | " + << "Max: " << gpu_info.gpu_temperature.max_value + << " Min: " << gpu_info.gpu_temperature.min_value + << " Avg: " << gpu_info.gpu_temperature.average << " SD: " << std::fixed + << std::setprecision(2) << gpu_info.gpu_temperature.standard_deviation << "\n"; + std::cout << "| PCIe Rx Bandwidth (megabytes) | " + << "Max: " << gpu_info.pcie_rx.max_value << " Min: " << gpu_info.pcie_rx.min_value + << " Avg: " << gpu_info.pcie_rx.average << " SD: " << std::fixed << std::setprecision(2) + << gpu_info.pcie_rx.standard_deviation << "\n"; + std::cout << "| PCIe Tx Bandwidth (megabytes) | " + << "Max: " << gpu_info.pcie_tx.max_value << " Min: " << gpu_info.pcie_tx.min_value + << " Avg: " << 
gpu_info.pcie_tx.average << " SD: " << std::fixed << std::setprecision(2) + << gpu_info.pcie_tx.standard_deviation << "\n"; + std::cout << "| Correctable ECC Errors | " << gpu_info.ecc_correct << "\n"; + std::cout << "| Uncorrectable ECC Errors | " << gpu_info.ecc_uncorrect << "\n"; + std::cout << "+----------------------------------" + << "+------------------------------------\n"; } void RdciStatsSubSystem::process() { - if (stats_ops_ == STATS_HELP || - stats_ops_ == STATS_UNKNOWN) { - show_help(); - return; + if (stats_ops_ == STATS_HELP || stats_ops_ == STATS_UNKNOWN) { + show_help(); + return; + } + + rdc_status_t result; + if (stats_ops_ == STATS_START_RECORDING) { + // Record job every 1 second + result = + rdc_job_start_stats(rdc_handle_, group_id_, const_cast(job_id_.c_str()), 1000000); + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); + } + if (is_json_output()) { + std::cout << "\"job_id\": \"" << job_id_ << "\", \"group_id\": \"" << group_id_ + << "\", \"status\": \"ok\""; + } else { + std::cout << "Successfully started recording job " << job_id_ << " with a group ID " + << group_id_ << std::endl; + } + return; + } + + if (stats_ops_ == STATS_STOP_RECORDING) { + result = rdc_job_stop_stats(rdc_handle_, const_cast(job_id_.c_str())); + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); + } + if (is_json_output()) { + std::cout << "\"job_id\": \"" << job_id_ << "\", \"status\": \"ok\""; + } else { + std::cout << "Successfully stopped recording job " << job_id_ << std::endl; + } + return; + } + + if (stats_ops_ == STATS_DISPLAY) { + rdc_job_info_t job_info; + result = rdc_job_get_stats(rdc_handle_, const_cast(job_id_.c_str()), &job_info); + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); } - rdc_status_t result; - if (stats_ops_ == STATS_START_RECORDING) { - // Record job every 1 second - result = rdc_job_start_stats(rdc_handle_, group_id_, - 
const_cast(job_id_.c_str()), 1000000); - if (result != RDC_ST_OK) { - throw RdcException(result, rdc_status_string(result)); - } - if (is_json_output()) { - std::cout << "\"job_id\": \"" << job_id_ << "\", \"group_id\": \"" - << group_id_ <<"\", \"status\": \"ok\""; - } else { - std::cout << "Successfully started recording job " - << job_id_ << " with a group ID " << group_id_ << std::endl; - } - return; + if (!is_json_output()) { + std::cout << "| Summary \n"; + show_job_stats(job_info.summary); + } else { + std::cout << "\"job_summary\" : {"; + show_job_stats_json(job_info.summary); + std::cout << "}"; } - - if (stats_ops_ == STATS_STOP_RECORDING) { - result = rdc_job_stop_stats(rdc_handle_, - const_cast(job_id_.c_str())); - if (result != RDC_ST_OK) { - throw RdcException(result, rdc_status_string(result)); - } - if (is_json_output()) { - std::cout << "\"job_id\": \"" << job_id_ - << "\", \"status\": \"ok\""; - } else { - std::cout << "Successfully stopped recording job " - << job_id_ << std::endl; - } - return; + if (is_verbose_ == false) { + return; } - - if (stats_ops_ == STATS_DISPLAY) { - rdc_job_info_t job_info; - result = rdc_job_get_stats(rdc_handle_, - const_cast(job_id_.c_str()), &job_info); - if (result != RDC_ST_OK) { - throw RdcException(result, rdc_status_string(result)); - } - - if (!is_json_output()) { - std::cout << "| Summary \n"; - show_job_stats(job_info.summary); - } else { - std::cout << "\"job_summary\" : {"; - show_job_stats_json(job_info.summary); - std::cout << "}"; - } - if (is_verbose_ == false) { - return; - } - for (uint32_t i = 0; i < job_info.num_gpus; i++) { - if (!is_json_output()) { - std::cout << "| GPU " << i << "\n"; - show_job_stats(job_info.gpus[i]); - } else { - std:: cout << ", \"gpu_" << i << "\": {"; - show_job_stats_json(job_info.gpus[i]); - std::cout << "}"; - } - } - return; + for (uint32_t i = 0; i < job_info.num_gpus; i++) { + if (!is_json_output()) { + std::cout << "| GPU " << i << "\n"; + 
show_job_stats(job_info.gpus[i]); + } else { + std::cout << ", \"gpu_" << i << "\": {"; + show_job_stats_json(job_info.gpus[i]); + std::cout << "}"; + } } + return; + } - if (stats_ops_ == STATS_REMOVE) { - result = rdc_job_remove(rdc_handle_, - const_cast(job_id_.c_str())); - if (result != RDC_ST_OK) { - throw RdcException(result, rdc_status_string(result)); - } - if (is_json_output()) { - std::cout << "\"job_id\": \"" << job_id_ - << "\", \"status\": \"ok\""; - } else { - std::cout << "Successfully removed job " - << job_id_ << std::endl; - } - return; + if (stats_ops_ == STATS_REMOVE) { + result = rdc_job_remove(rdc_handle_, const_cast(job_id_.c_str())); + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); } + if (is_json_output()) { + std::cout << "\"job_id\": \"" << job_id_ << "\", \"status\": \"ok\""; + } else { + std::cout << "Successfully removed job " << job_id_ << std::endl; + } + return; + } - if (stats_ops_ == STATS_REMOVE_ALL) { - result = rdc_job_remove_all(rdc_handle_); - if (result != RDC_ST_OK) { - throw RdcException(result, rdc_status_string(result)); - } - if (is_json_output()) { - std::cout << "\"status\": \"ok\""; - } else { - std::cout << "Successfully removed all jobs\n"; - } - return; + if (stats_ops_ == STATS_REMOVE_ALL) { + result = rdc_job_remove_all(rdc_handle_); + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); } + if (is_json_output()) { + std::cout << "\"status\": \"ok\""; + } else { + std::cout << "Successfully removed all jobs\n"; + } + return; + } } } // namespace rdc } // namespace amd - - diff --git a/projects/rdc/rdci/src/RdciSubSystem.cc b/projects/rdc/rdci/src/RdciSubSystem.cc index 0d4b03fb3a..a19e8a2c24 100644 --- a/projects/rdc/rdci/src/RdciSubSystem.cc +++ b/projects/rdc/rdci/src/RdciSubSystem.cc @@ -20,134 +20,126 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "RdciSubSystem.h" + #include -#include "rdc_lib/RdcException.h" -#include "common/rdc_utils.h" + #include "common/rdc_fields_supported.h" +#include "common/rdc_utils.h" +#include "rdc_lib/RdcException.h" namespace amd { namespace rdc { -RdciSubSystem::RdciSubSystem(): - rdc_handle_(nullptr) - , ip_port_("localhost:50051") // default host - , use_auth_(true) - , root_ca_("/etc/rdc/client/certs/rdc_cacert.pem") - , client_cert_("/etc/rdc/client/certs/rdc_client_cert.pem") - , client_key_("/etc/rdc/client/private/rdc_client_cert.key") - , is_json_output_(false) { - rdc_status_t status = rdc_init(0); - if (status != RDC_ST_OK) { - throw RdcException(status, "RDC initialize fail"); - } +RdciSubSystem::RdciSubSystem() + : rdc_handle_(nullptr), + ip_port_("localhost:50051") // default host + , + use_auth_(true), + root_ca_("/etc/rdc/client/certs/rdc_cacert.pem"), + client_cert_("/etc/rdc/client/certs/rdc_client_cert.pem"), + client_key_("/etc/rdc/client/private/rdc_client_cert.key"), + is_json_output_(false) { + rdc_status_t status = rdc_init(0); + if (status != RDC_ST_OK) { + throw RdcException(status, "RDC initialize fail"); + } } -bool RdciSubSystem::is_json_output() const { - return is_json_output_; -} +bool RdciSubSystem::is_json_output() const { return is_json_output_; } -std::vector RdciSubSystem::split_string(const std::string& s, - char delimiter) const { - std::vector tokens; - std::string token; - std::istringstream tokenStream(s); - while (std::getline(tokenStream, token, delimiter)) { - tokens.push_back(token); - } - return tokens; +std::vector RdciSubSystem::split_string(const std::string& s, char delimiter) const { + std::vector tokens; + std::string token; + std::istringstream tokenStream(s); + while (std::getline(tokenStream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; } void RdciSubSystem::connect() { - rdc_status_t status; + rdc_status_t status; - if (use_auth_) { - std::string ca_pem; - std::string client_cert_pem; 
- std::string client_key_pem; + if (use_auth_) { + std::string ca_pem; + std::string client_cert_pem; + std::string client_key_pem; - if (!FileExists(root_ca_.c_str())) { - std::cout << "In order to use the SSL mutual authentication, the " + if (!FileExists(root_ca_.c_str())) { + std::cout << "In order to use the SSL mutual authentication, the " << "root CA must be copied to " << root_ca_ << std::endl; - throw RdcException(RDC_ST_BAD_PARAMETER, "root CA not found"); - } - int ret = ReadFile(root_ca_, &ca_pem); - if (ret) { - throw RdcException(RDC_ST_BAD_PARAMETER, - std::string("Fail to read root CA at") + root_ca_); - } - if (!FileExists(client_cert_.c_str())) { - std::cout << "In order to use the SSL mutual authentication, the " - << "client certificate must be copied to " - << client_cert_ << std::endl; - throw RdcException(RDC_ST_BAD_PARAMETER, - "client cert not found"); - } - ret = ReadFile(client_cert_, &client_cert_pem); - if (ret) { - throw RdcException(RDC_ST_BAD_PARAMETER, - std::string("Fail to read client certificate at") + client_cert_); - } - if (!FileExists(client_key_.c_str())) { - std::cout << "In order to use the SSL mutual authentication, the " - << "client private key must be copied to " - << client_key_ << std::endl; - throw RdcException(RDC_ST_BAD_PARAMETER, - "client key not found"); - } - ret = ReadFile(client_key_, &client_key_pem); - if (ret) { - throw RdcException(RDC_ST_BAD_PARAMETER, - std::string("Fail to read client key at ") + client_key_); - } - - status = rdc_connect(ip_port_.c_str(), &rdc_handle_, - ca_pem.c_str(), client_cert_pem.c_str(), client_key_pem.c_str()); - } else { // Not use the SSL mutual authentication - status = rdc_connect(ip_port_.c_str(), &rdc_handle_, - nullptr, nullptr, nullptr); + throw RdcException(RDC_ST_BAD_PARAMETER, "root CA not found"); + } + int ret = ReadFile(root_ca_, &ca_pem); + if (ret) { + throw RdcException(RDC_ST_BAD_PARAMETER, std::string("Fail to read root CA at") + root_ca_); + } + if 
(!FileExists(client_cert_.c_str())) { + std::cout << "In order to use the SSL mutual authentication, the " + << "client certificate must be copied to " << client_cert_ << std::endl; + throw RdcException(RDC_ST_BAD_PARAMETER, "client cert not found"); + } + ret = ReadFile(client_cert_, &client_cert_pem); + if (ret) { + throw RdcException(RDC_ST_BAD_PARAMETER, + std::string("Fail to read client certificate at") + client_cert_); + } + if (!FileExists(client_key_.c_str())) { + std::cout << "In order to use the SSL mutual authentication, the " + << "client private key must be copied to " << client_key_ << std::endl; + throw RdcException(RDC_ST_BAD_PARAMETER, "client key not found"); + } + ret = ReadFile(client_key_, &client_key_pem); + if (ret) { + throw RdcException(RDC_ST_BAD_PARAMETER, + std::string("Fail to read client key at ") + client_key_); } - if (status != RDC_ST_OK) { - throw RdcException(status, - "Fail to setup the connection. Please check all libraries in right folder"); - } + status = rdc_connect(ip_port_.c_str(), &rdc_handle_, ca_pem.c_str(), client_cert_pem.c_str(), + client_key_pem.c_str()); + } else { // Not use the SSL mutual authentication + status = rdc_connect(ip_port_.c_str(), &rdc_handle_, nullptr, nullptr, nullptr); + } + + if (status != RDC_ST_OK) { + throw RdcException(status, + "Fail to setup the connection. Please check all " + "libraries in right folder"); + } } void RdciSubSystem::show_common_usage() const { - std::cout << " --host :port Connects to " + std::cout << " --host :port Connects to " << "specified IP or fully-qualified domain name.\n"; - std::cout << " The port " + std::cout << " The port " << "must be specified.\n"; - std::cout << " Default: localhost:50051\n"; - std::cout << " -u --unauth Do not use the SSL mutual" + std::cout << " Default: localhost:50051\n"; + std::cout << " -u --unauth Do not use the SSL mutual" << " authentication to encrypt the communication\n" << " Default: SSL mutual will be" - << " used. 
You must copy the root CA to " - << root_ca_ << "\n" - << " Client certificate to " - << client_cert_ << "\n" - << " Client key to " - << client_key_ << "\n"; - std::cout << " -h --help Displays usage " - << "information and exits.\n"; + << " used. You must copy the root CA to " << root_ca_ << "\n" + << " Client certificate to " << client_cert_ << "\n" + << " Client key to " << client_key_ << "\n"; + std::cout << " -h --help Displays usage " + << "information and exits.\n"; } void RdciSubSystem::set_json_output(bool is_json) { - is_json_output_ = is_json; - std::cout << "{"; + is_json_output_ = is_json; + std::cout << "{"; } RdciSubSystem::~RdciSubSystem() { - if (rdc_handle_) { - rdc_disconnect(rdc_handle_); - rdc_handle_ = nullptr; - } + if (rdc_handle_) { + rdc_disconnect(rdc_handle_); + rdc_handle_ = nullptr; + } - rdc_shutdown(); + rdc_shutdown(); - if (is_json_output_) { - std::cout << "}" << std::endl; - } + if (is_json_output_) { + std::cout << "}" << std::endl; + } } } // namespace rdc diff --git a/projects/rdc/rdci/src/rdci.cc b/projects/rdc/rdci/src/rdci.cc index f3ee185d52..5da66a68b1 100644 --- a/projects/rdc/rdci/src/rdci.cc +++ b/projects/rdc/rdci/src/rdci.cc @@ -22,68 +22,68 @@ THE SOFTWARE. 
#include #include -#include "rdc_lib/rdc_common.h" -#include "rdc/rdc.h" -#include "rdc_lib/RdcException.h" + +#include "RdciDiagSubSystem.h" #include "RdciDiscoverySubSystem.h" #include "RdciDmonSubSystem.h" -#include "RdciDiagSubSystem.h" #include "RdciFieldGroupSubSystem.h" #include "RdciGroupSubSystem.h" #include "RdciStatsSubSystem.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcException.h" +#include "rdc_lib/rdc_common.h" +int main(int argc, char** argv) { + const std::string usage_help = + "Usage:\trdci \nsubsystem: discovery, dmon, group, " + "fieldgroup, stats, diag\n"; -int main(int argc, char ** argv) { - const std::string usage_help = - "Usage:\trdci \nsubsystem: discovery, dmon, group, " - "fieldgroup, stats, diag\n"; + if (argc <= 1) { + std::cout << usage_help; + exit(0); + } - if (argc <= 1) { - std::cout << usage_help; - exit(0); + amd::rdc::RdciSubSystemPtr subsystem; + try { + std::string subsystem_name = argv[1]; + if (subsystem_name == "discovery") { + subsystem.reset(new amd::rdc::RdciDiscoverySubSystem()); + } else if (subsystem_name == "dmon") { + subsystem.reset(new amd::rdc::RdciDmonSubSystem()); + } else if (subsystem_name == "diag") { + subsystem.reset(new amd::rdc::RdciDiagSubSystem()); + } else if (subsystem_name == "group") { + subsystem.reset(new amd::rdc::RdciGroupSubSystem()); + } else if (subsystem_name == "fieldgroup") { + subsystem.reset(new amd::rdc::RdciFieldGroupSubSystem()); + } else if (subsystem_name == "stats") { + subsystem.reset(new amd::rdc::RdciStatsSubSystem()); + } else { + std::cout << usage_help; + exit(0); } - amd::rdc::RdciSubSystemPtr subsystem; - try { - std::string subsystem_name = argv[1]; - if (subsystem_name == "discovery") { - subsystem.reset(new amd::rdc::RdciDiscoverySubSystem()); - } else if (subsystem_name == "dmon") { - subsystem.reset(new amd::rdc::RdciDmonSubSystem()); - } else if (subsystem_name == "diag") { - subsystem.reset(new amd::rdc::RdciDiagSubSystem()); - } else if (subsystem_name == 
"group") { - subsystem.reset(new amd::rdc::RdciGroupSubSystem()); - } else if (subsystem_name == "fieldgroup") { - subsystem.reset(new amd::rdc::RdciFieldGroupSubSystem()); - } else if (subsystem_name == "stats") { - subsystem.reset(new amd::rdc::RdciStatsSubSystem()); - } else { - std::cout << usage_help; - exit(0); - } + subsystem->parse_cmd_opts(argc, argv); - subsystem->parse_cmd_opts(argc, argv); + subsystem->connect(); - subsystem->connect(); - - subsystem->process(); - } catch (const amd::rdc::RdcException& e) { - if (subsystem && subsystem->is_json_output()) { - std::cout << "\"status\": \"error\", \"description\": \"" - << e.what() << '"'; - } else { - std::cout << "rdci Error: " << e.what() << std::endl; - } - return e.error_code(); - } catch (...) { - if (subsystem && subsystem->is_json_output()) { - std::cout << "\"status\": \"error\", \"description\": " - << "\"Unhandled exception.\""; - } else { - std::cout << "Unhandled exception." << std::endl; - } return 1; + subsystem->process(); + } catch (const amd::rdc::RdcException& e) { + if (subsystem && subsystem->is_json_output()) { + std::cout << "\"status\": \"error\", \"description\": \"" << e.what() << '"'; + } else { + std::cout << "rdci Error: " << e.what() << std::endl; } + return e.error_code(); + } catch (...) { + if (subsystem && subsystem->is_json_output()) { + std::cout << "\"status\": \"error\", \"description\": " + << "\"Unhandled exception.\""; + } else { + std::cout << "Unhandled exception." << std::endl; + } + return 1; + } - return 0; + return 0; } diff --git a/projects/rdc/server/include/rdc/rdc_admin_service.h b/projects/rdc/server/include/rdc/rdc_admin_service.h old mode 100755 new mode 100644 index e7d93dd5ab..71730ba96d --- a/projects/rdc/server/include/rdc/rdc_admin_service.h +++ b/projects/rdc/server/include/rdc/rdc_admin_service.h @@ -23,19 +23,20 @@ THE SOFTWARE. 
#define SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_ #include "rdc.grpc.pb.h" // NOLINT -#include "rocm_smi/rocm_smi.h" #include "rdc/rdc_admin_service.h" +#include "rocm_smi/rocm_smi.h" namespace amd { namespace rdc { class RDCAdminServiceImpl final : public ::rdc::RdcAdmin::Service { public: - RDCAdminServiceImpl(); - ~RDCAdminServiceImpl(); - ::grpc::Status VerifyConnection(::grpc::ServerContext* context, - const ::rdc::VerifyConnectionRequest* request, - ::rdc::VerifyConnectionResponse* reply) override; + RDCAdminServiceImpl(); + ~RDCAdminServiceImpl(); + ::grpc::Status VerifyConnection(::grpc::ServerContext* context, + const ::rdc::VerifyConnectionRequest* request, + ::rdc::VerifyConnectionResponse* reply) override; + private: }; diff --git a/projects/rdc/server/include/rdc/rdc_api_service.h b/projects/rdc/server/include/rdc/rdc_api_service.h old mode 100755 new mode 100644 index 515640d596..8572335afb --- a/projects/rdc/server/include/rdc/rdc_api_service.h +++ b/projects/rdc/server/include/rdc/rdc_api_service.h @@ -30,108 +30,101 @@ namespace rdc { class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service { public: - RdcAPIServiceImpl(); - ~RdcAPIServiceImpl(); + RdcAPIServiceImpl(); + ~RdcAPIServiceImpl(); - rdc_status_t Initialize(uint64_t rdcd_init_flags = 0); + rdc_status_t Initialize(uint64_t rdcd_init_flags = 0); - ::grpc::Status GetAllDevices(::grpc::ServerContext* context, - const ::rdc::Empty* request, - ::rdc::GetAllDevicesResponse* reply) override; + ::grpc::Status GetAllDevices(::grpc::ServerContext* context, const ::rdc::Empty* request, + ::rdc::GetAllDevicesResponse* reply) override; - ::grpc::Status GetDeviceAttributes(::grpc::ServerContext* context, - const ::rdc::GetDeviceAttributesRequest* request, - ::rdc::GetDeviceAttributesResponse* reply) override; + ::grpc::Status GetDeviceAttributes(::grpc::ServerContext* context, + const ::rdc::GetDeviceAttributesRequest* request, + ::rdc::GetDeviceAttributesResponse* reply) override; - ::grpc::Status 
CreateGpuGroup(::grpc::ServerContext* context, - const ::rdc::CreateGpuGroupRequest* request, - ::rdc::CreateGpuGroupResponse* reply) override; + ::grpc::Status CreateGpuGroup(::grpc::ServerContext* context, + const ::rdc::CreateGpuGroupRequest* request, + ::rdc::CreateGpuGroupResponse* reply) override; - ::grpc::Status AddToGpuGroup(::grpc::ServerContext* context, - const ::rdc::AddToGpuGroupRequest* request, - ::rdc::AddToGpuGroupResponse* reply) override; + ::grpc::Status AddToGpuGroup(::grpc::ServerContext* context, + const ::rdc::AddToGpuGroupRequest* request, + ::rdc::AddToGpuGroupResponse* reply) override; - ::grpc::Status GetGpuGroupInfo(::grpc::ServerContext* context, - const ::rdc::GetGpuGroupInfoRequest* request, - ::rdc::GetGpuGroupInfoResponse* reply) override; + ::grpc::Status GetGpuGroupInfo(::grpc::ServerContext* context, + const ::rdc::GetGpuGroupInfoRequest* request, + ::rdc::GetGpuGroupInfoResponse* reply) override; - ::grpc::Status GetGroupAllIds(::grpc::ServerContext* context, - const ::rdc::Empty* request, - ::rdc::GetGroupAllIdsResponse* reply) override; + ::grpc::Status GetGroupAllIds(::grpc::ServerContext* context, const ::rdc::Empty* request, + ::rdc::GetGroupAllIdsResponse* reply) override; - ::grpc::Status DestroyGpuGroup(::grpc::ServerContext* context, - const ::rdc::DestroyGpuGroupRequest* request, - ::rdc::DestroyGpuGroupResponse* reply) override; + ::grpc::Status DestroyGpuGroup(::grpc::ServerContext* context, + const ::rdc::DestroyGpuGroupRequest* request, + ::rdc::DestroyGpuGroupResponse* reply) override; - ::grpc::Status CreateFieldGroup(::grpc::ServerContext* context, - const ::rdc::CreateFieldGroupRequest* request, - ::rdc::CreateFieldGroupResponse* reply) override; + ::grpc::Status CreateFieldGroup(::grpc::ServerContext* context, + const ::rdc::CreateFieldGroupRequest* request, + ::rdc::CreateFieldGroupResponse* reply) override; - ::grpc::Status GetFieldGroupInfo(::grpc::ServerContext* context, - const 
::rdc::GetFieldGroupInfoRequest* request, - ::rdc::GetFieldGroupInfoResponse* reply) override; + ::grpc::Status GetFieldGroupInfo(::grpc::ServerContext* context, + const ::rdc::GetFieldGroupInfoRequest* request, + ::rdc::GetFieldGroupInfoResponse* reply) override; - ::grpc::Status GetFieldGroupAllIds(::grpc::ServerContext* context, - const ::rdc::Empty* request, - ::rdc::GetFieldGroupAllIdsResponse* reply) override; + ::grpc::Status GetFieldGroupAllIds(::grpc::ServerContext* context, const ::rdc::Empty* request, + ::rdc::GetFieldGroupAllIdsResponse* reply) override; - ::grpc::Status DestroyFieldGroup(::grpc::ServerContext* context, - const ::rdc::DestroyFieldGroupRequest* request, - ::rdc::DestroyFieldGroupResponse* reply) override; + ::grpc::Status DestroyFieldGroup(::grpc::ServerContext* context, + const ::rdc::DestroyFieldGroupRequest* request, + ::rdc::DestroyFieldGroupResponse* reply) override; - ::grpc::Status WatchFields(::grpc::ServerContext* context, - const ::rdc::WatchFieldsRequest* request, - ::rdc::WatchFieldsResponse* reply) override; + ::grpc::Status WatchFields(::grpc::ServerContext* context, + const ::rdc::WatchFieldsRequest* request, + ::rdc::WatchFieldsResponse* reply) override; - ::grpc::Status GetLatestFieldValue(::grpc::ServerContext* context, - const ::rdc::GetLatestFieldValueRequest* request, - ::rdc::GetLatestFieldValueResponse* reply) override; + ::grpc::Status GetLatestFieldValue(::grpc::ServerContext* context, + const ::rdc::GetLatestFieldValueRequest* request, + ::rdc::GetLatestFieldValueResponse* reply) override; - ::grpc::Status GetFieldSince(::grpc::ServerContext* context, - const ::rdc::GetFieldSinceRequest* request, - ::rdc::GetFieldSinceResponse* reply) override; + ::grpc::Status GetFieldSince(::grpc::ServerContext* context, + const ::rdc::GetFieldSinceRequest* request, + ::rdc::GetFieldSinceResponse* reply) override; - ::grpc::Status UnWatchFields(::grpc::ServerContext* context, - const ::rdc::UnWatchFieldsRequest* request, - 
::rdc::UnWatchFieldsResponse* reply) override; + ::grpc::Status UnWatchFields(::grpc::ServerContext* context, + const ::rdc::UnWatchFieldsRequest* request, + ::rdc::UnWatchFieldsResponse* reply) override; - ::grpc::Status UpdateAllFields(::grpc::ServerContext* context, - const ::rdc::UpdateAllFieldsRequest* request, - ::rdc::UpdateAllFieldsResponse* reply) override; + ::grpc::Status UpdateAllFields(::grpc::ServerContext* context, + const ::rdc::UpdateAllFieldsRequest* request, + ::rdc::UpdateAllFieldsResponse* reply) override; + ::grpc::Status StartJobStats(::grpc::ServerContext* context, + const ::rdc::StartJobStatsRequest* request, + ::rdc::StartJobStatsResponse* reply) override; - ::grpc::Status StartJobStats(::grpc::ServerContext* context, - const ::rdc::StartJobStatsRequest* request, - ::rdc::StartJobStatsResponse* reply) override; + ::grpc::Status GetJobStats(::grpc::ServerContext* context, + const ::rdc::GetJobStatsRequest* request, + ::rdc::GetJobStatsResponse* reply) override; - ::grpc::Status GetJobStats(::grpc::ServerContext* context, - const ::rdc::GetJobStatsRequest* request, - ::rdc::GetJobStatsResponse* reply) override; + ::grpc::Status StopJobStats(::grpc::ServerContext* context, + const ::rdc::StopJobStatsRequest* request, + ::rdc::StopJobStatsResponse* reply) override; - ::grpc::Status StopJobStats(::grpc::ServerContext* context, - const ::rdc::StopJobStatsRequest* request, - ::rdc::StopJobStatsResponse* reply) override; + ::grpc::Status RemoveJob(::grpc::ServerContext* context, const ::rdc::RemoveJobRequest* request, + ::rdc::RemoveJobResponse* reply) override; - ::grpc::Status RemoveJob(::grpc::ServerContext* context, - const ::rdc::RemoveJobRequest* request, - ::rdc::RemoveJobResponse* reply) override; + ::grpc::Status RemoveAllJob(::grpc::ServerContext* context, const ::rdc::Empty* request, + ::rdc::RemoveAllJobResponse* reply) override; - ::grpc::Status RemoveAllJob(::grpc::ServerContext* context, - const ::rdc::Empty* request, - 
::rdc::RemoveAllJobResponse* reply) override; + ::grpc::Status DiagnosticRun(::grpc::ServerContext* context, + const ::rdc::DiagnosticRunRequest* request, + ::rdc::DiagnosticRunResponse* reply) override; - ::grpc::Status DiagnosticRun(::grpc::ServerContext* context, - const ::rdc::DiagnosticRunRequest* request, - ::rdc::DiagnosticRunResponse* reply) override; - - ::grpc::Status DiagnosticTestCaseRun(::grpc::ServerContext* context, - const ::rdc::DiagnosticTestCaseRunRequest* request, - ::rdc::DiagnosticTestCaseRunResponse* reply) override; + ::grpc::Status DiagnosticTestCaseRun(::grpc::ServerContext* context, + const ::rdc::DiagnosticTestCaseRunRequest* request, + ::rdc::DiagnosticTestCaseRunResponse* reply) override; private: - bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, - ::rdc::GpuUsageInfo* target); - rdc_handle_t rdc_handle_; + bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target); + rdc_handle_t rdc_handle_; }; } // namespace rdc diff --git a/projects/rdc/server/include/rdc/rdc_rsmi_service.h b/projects/rdc/server/include/rdc/rdc_rsmi_service.h old mode 100755 new mode 100644 index 40bc7907d7..ea68c1a0f3 --- a/projects/rdc/server/include/rdc/rdc_rsmi_service.h +++ b/projects/rdc/server/include/rdc/rdc_rsmi_service.h @@ -23,46 +23,40 @@ THE SOFTWARE. 
#define SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_ #include "rdc.grpc.pb.h" // NOLINT -#include "rocm_smi/rocm_smi.h" #include "rdc/rdc_rsmi_service.h" +#include "rocm_smi/rocm_smi.h" namespace amd { namespace rdc { class RsmiServiceImpl final : public ::rdc::Rsmi::Service { public: - RsmiServiceImpl(); - ~RsmiServiceImpl(); + RsmiServiceImpl(); + ~RsmiServiceImpl(); - rsmi_status_t Initialize(uint64_t rsmi_init_flags = 0); + rsmi_status_t Initialize(uint64_t rsmi_init_flags = 0); - ::grpc::Status - GetNumDevices(::grpc::ServerContext* context, - const ::rdc::GetNumDevicesRequest* request, - ::rdc::GetNumDevicesResponse* reply) override; + ::grpc::Status GetNumDevices(::grpc::ServerContext* context, + const ::rdc::GetNumDevicesRequest* request, + ::rdc::GetNumDevicesResponse* reply) override; - ::grpc::Status - GetTemperature(::grpc::ServerContext* context, - const ::rdc::GetTemperatureRequest* request, - ::rdc::GetTemperatureResponse* response) override; + ::grpc::Status GetTemperature(::grpc::ServerContext* context, + const ::rdc::GetTemperatureRequest* request, + ::rdc::GetTemperatureResponse* response) override; - ::grpc::Status - GetFanRpms(::grpc::ServerContext* context, - const ::rdc::GetFanRpmsRequest* request, - ::rdc::GetFanRpmsResponse* response) override; + ::grpc::Status GetFanRpms(::grpc::ServerContext* context, const ::rdc::GetFanRpmsRequest* request, + ::rdc::GetFanRpmsResponse* response) override; - ::grpc::Status - GetFanSpeed(::grpc::ServerContext* context, - const ::rdc::GetFanSpeedRequest* request, - ::rdc::GetFanSpeedResponse* response) override; + ::grpc::Status GetFanSpeed(::grpc::ServerContext* context, + const ::rdc::GetFanSpeedRequest* request, + ::rdc::GetFanSpeedResponse* response) override; - ::grpc::Status - GetFanSpeedMax(::grpc::ServerContext* context, - const ::rdc::GetFanSpeedMaxRequest* request, - ::rdc::GetFanSpeedMaxResponse* response) override; + ::grpc::Status GetFanSpeedMax(::grpc::ServerContext* context, + const 
::rdc::GetFanSpeedMaxRequest* request, + ::rdc::GetFanSpeedMaxResponse* response) override; private: - bool rsmi_initialized_; + bool rsmi_initialized_; }; } // namespace rdc diff --git a/projects/rdc/server/include/rdc/rdc_server_main.h b/projects/rdc/server/include/rdc/rdc_server_main.h old mode 100755 new mode 100644 index a62d801d32..73bd4f5050 --- a/projects/rdc/server/include/rdc/rdc_server_main.h +++ b/projects/rdc/server/include/rdc/rdc_server_main.h @@ -24,12 +24,12 @@ THE SOFTWARE. #include -#include #include +#include -#include "rdc/rdc_rsmi_service.h" #include "rdc/rdc_admin_service.h" #include "rdc/rdc_api_service.h" +#include "rdc/rdc_rsmi_service.h" typedef struct { std::string listen_address; @@ -41,43 +41,42 @@ typedef struct { class RDCServer { public: - RDCServer(); - ~RDCServer(); + RDCServer(); + ~RDCServer(); - void Initialize(RdcdCmdLineOpts *cl); + void Initialize(RdcdCmdLineOpts* cl); - void Run(void); - void ShutDown(void); + void Run(void); + void ShutDown(void); - bool start_rsmi_service(void) const {return start_rsmi_service_;} - void set_start_rsmi_service(bool s) {start_rsmi_service_ = s;} + bool start_rsmi_service(void) const { return start_rsmi_service_; } + void set_start_rsmi_service(bool s) { start_rsmi_service_ = s; } - bool start_rdc_admin_service(void) const {return start_rdc_admin_service_;} - void set_start_rdc_admin_service(bool s) {start_rdc_admin_service_ = s;} + bool start_rdc_admin_service(void) const { return start_rdc_admin_service_; } + void set_start_rdc_admin_service(bool s) { start_rdc_admin_service_ = s; } - bool start_api_service(void) const {return start_api_service_;} - void set_start_api_service(bool s) {start_api_service_ = s;} + bool start_api_service(void) const { return start_api_service_; } + void set_start_api_service(bool s) { start_api_service_ = s; } - bool secure_creds(void) const {return secure_creds_;} - void set_secure_creds(bool s) {secure_creds_ = s;} + bool secure_creds(void) const { return 
secure_creds_; } + void set_secure_creds(bool s) { secure_creds_ = s; } private: - void HandleSignal(int sig); - std::string server_address_; - std::unique_ptr<::grpc::Server> server_; - bool secure_creds_; - bool use_pinned_certs_; - bool log_debug_; - bool start_rsmi_service_; - amd::rdc::RsmiServiceImpl *rsmi_service_; - RdcdCmdLineOpts *cmd_line_; + void HandleSignal(int sig); + std::string server_address_; + std::unique_ptr<::grpc::Server> server_; + bool secure_creds_; + bool use_pinned_certs_; + bool log_debug_; + bool start_rsmi_service_; + amd::rdc::RsmiServiceImpl* rsmi_service_; + RdcdCmdLineOpts* cmd_line_; - bool start_rdc_admin_service_; - amd::rdc::RDCAdminServiceImpl *rdc_admin_service_; + bool start_rdc_admin_service_; + amd::rdc::RDCAdminServiceImpl* rdc_admin_service_; - bool start_api_service_; - amd::rdc::RdcAPIServiceImpl *api_service_; + bool start_api_service_; + amd::rdc::RdcAPIServiceImpl* api_service_; }; #endif // SERVER_INCLUDE_RDC_RDC_SERVER_MAIN_H_ - diff --git a/projects/rdc/server/src/rdc_admin_service.cc b/projects/rdc/server/src/rdc_admin_service.cc old mode 100755 new mode 100644 index 5c6fb23372..25b6ed5394 --- a/projects/rdc/server/src/rdc_admin_service.cc +++ b/projects/rdc/server/src/rdc_admin_service.cc @@ -20,31 +20,29 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "rdc/rdc_admin_service.h" + #include #include #include -#include -#include -#include -#include #include +#include +#include +#include +#include #include "rdc.grpc.pb.h" // NOLINT -#include "rdc/rdc_admin_service.h" namespace amd { namespace rdc { -RDCAdminServiceImpl::RDCAdminServiceImpl() { -} +RDCAdminServiceImpl::RDCAdminServiceImpl() {} -RDCAdminServiceImpl::~RDCAdminServiceImpl() { -} -::grpc::Status -RDCAdminServiceImpl::VerifyConnection(::grpc::ServerContext* context, - const ::rdc::VerifyConnectionRequest* request, - ::rdc::VerifyConnectionResponse* reply) { +RDCAdminServiceImpl::~RDCAdminServiceImpl() {} +::grpc::Status RDCAdminServiceImpl::VerifyConnection(::grpc::ServerContext* context, + const ::rdc::VerifyConnectionRequest* request, + ::rdc::VerifyConnectionResponse* reply) { (void)context; // Quiet warning for now reply->set_echo_magic_num(request->magic_num()); diff --git a/projects/rdc/server/src/rdc_api_service.cc b/projects/rdc/server/src/rdc_api_service.cc old mode 100755 new mode 100644 index fc4c68282f..83528d1df1 --- a/projects/rdc/server/src/rdc_api_service.cc +++ b/projects/rdc/server/src/rdc_api_service.cc @@ -19,16 +19,17 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "rdc/rdc_api_service.h" + #include #include +#include #include #include #include -#include #include "rdc.grpc.pb.h" // NOLINT -#include "rdc/rdc_api_service.h" #include "rdc/rdc.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" @@ -36,22 +37,21 @@ THE SOFTWARE. 
namespace amd { namespace rdc { -RdcAPIServiceImpl::RdcAPIServiceImpl():rdc_handle_(nullptr) { -} +RdcAPIServiceImpl::RdcAPIServiceImpl() : rdc_handle_(nullptr) {} rdc_status_t RdcAPIServiceImpl::Initialize(uint64_t rdcd_init_flags) { - rdc_status_t result = rdc_init(rdcd_init_flags); - if (result != RDC_ST_OK) { - std::cout << "RDC API initialize fail\n"; - return result; - } - - result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle_); - if (result != RDC_ST_OK) { - std::cout << "RDC API start embedded fail\n"; - } - + rdc_status_t result = rdc_init(rdcd_init_flags); + if (result != RDC_ST_OK) { + std::cout << "RDC API initialize fail\n"; return result; + } + + result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle_); + if (result != RDC_ST_OK) { + std::cout << "RDC API start embedded fail\n"; + } + + return result; } RdcAPIServiceImpl::~RdcAPIServiceImpl() { @@ -63,643 +63,583 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() { } ::grpc::Status RdcAPIServiceImpl::GetAllDevices(::grpc::ServerContext* context, - const ::rdc::Empty* request, - ::rdc::GetAllDevicesResponse* reply) { - (void)(context); - (void)(request); - if (!reply) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty reply"); - } - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; - uint32_t count = 0; - rdc_status_t result = rdc_device_get_all(rdc_handle_, - gpu_index_list, &count); - reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - for (uint32_t i = 0; i < count; i++) { - reply->add_gpus(gpu_index_list[i]); - } - + const ::rdc::Empty* request, + ::rdc::GetAllDevicesResponse* reply) { + (void)(context); + (void)(request); + if (!reply) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty reply"); + } + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + uint32_t count = 0; + rdc_status_t result = rdc_device_get_all(rdc_handle_, gpu_index_list, &count); + reply->set_status(result); + if (result != RDC_ST_OK) { return 
::grpc::Status::OK; + } + for (uint32_t i = 0; i < count; i++) { + reply->add_gpus(gpu_index_list[i]); + } + + return ::grpc::Status::OK; } ::grpc::Status RdcAPIServiceImpl::GetDeviceAttributes( - ::grpc::ServerContext* context, - const ::rdc::GetDeviceAttributesRequest* request, - ::rdc::GetDeviceAttributesResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - uint32_t gpu_index = request->gpu_index(); - rdc_device_attributes_t attribute; - rdc_status_t result = rdc_device_get_attributes(rdc_handle_, - gpu_index, &attribute); + ::grpc::ServerContext* context, const ::rdc::GetDeviceAttributesRequest* request, + ::rdc::GetDeviceAttributesResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + uint32_t gpu_index = request->gpu_index(); + rdc_device_attributes_t attribute; + rdc_status_t result = rdc_device_get_attributes(rdc_handle_, gpu_index, &attribute); - ::rdc::DeviceAttributes* attr = reply->mutable_attributes(); - attr->set_device_name(attribute.device_name); + ::rdc::DeviceAttributes* attr = reply->mutable_attributes(); + attr->set_device_name(attribute.device_name); - reply->set_status(result); + reply->set_status(result); - return ::grpc::Status::OK; + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::CreateGpuGroup( - ::grpc::ServerContext* context, - const ::rdc::CreateGpuGroupRequest* request, - ::rdc::CreateGpuGroupResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - - rdc_gpu_group_t group_id = 0; - rdc_status_t result = rdc_group_gpu_create(rdc_handle_, - static_cast(request->type()), - request->group_name().c_str(), &group_id); - reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - - reply->set_group_id(group_id); +::grpc::Status 
RdcAPIServiceImpl::CreateGpuGroup(::grpc::ServerContext* context, + const ::rdc::CreateGpuGroupRequest* request, + ::rdc::CreateGpuGroupResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_gpu_group_t group_id = 0; + rdc_status_t result = + rdc_group_gpu_create(rdc_handle_, static_cast(request->type()), + request->group_name().c_str(), &group_id); + reply->set_status(result); + if (result != RDC_ST_OK) { return ::grpc::Status::OK; + } + + reply->set_group_id(group_id); + + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::AddToGpuGroup( - ::grpc::ServerContext* context, - const ::rdc::AddToGpuGroupRequest* request, - ::rdc::AddToGpuGroupResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } +::grpc::Status RdcAPIServiceImpl::AddToGpuGroup(::grpc::ServerContext* context, + const ::rdc::AddToGpuGroupRequest* request, + ::rdc::AddToGpuGroupResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } - rdc_status_t result = rdc_group_gpu_add(rdc_handle_, - request->group_id(), request->gpu_index()); - reply->set_status(result); + rdc_status_t result = rdc_group_gpu_add(rdc_handle_, request->group_id(), request->gpu_index()); + reply->set_status(result); - return ::grpc::Status::OK; + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::GetGpuGroupInfo( - ::grpc::ServerContext* context, - const ::rdc::GetGpuGroupInfoRequest* request, - ::rdc::GetGpuGroupInfoResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - - rdc_group_info_t group_info; - rdc_status_t result = rdc_group_gpu_get_info( - rdc_handle_, request->group_id(), &group_info); - reply->set_status(result); - if (result != RDC_ST_OK) { - 
return ::grpc::Status::OK; - } - - reply->set_group_name(group_info.group_name); - for (uint32_t i = 0; i < group_info.count; i++) { - reply->add_entity_ids(group_info.entity_ids[i]); - } +::grpc::Status RdcAPIServiceImpl::GetGpuGroupInfo(::grpc::ServerContext* context, + const ::rdc::GetGpuGroupInfoRequest* request, + ::rdc::GetGpuGroupInfoResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_group_info_t group_info; + rdc_status_t result = rdc_group_gpu_get_info(rdc_handle_, request->group_id(), &group_info); + reply->set_status(result); + if (result != RDC_ST_OK) { return ::grpc::Status::OK; + } + + reply->set_group_name(group_info.group_name); + for (uint32_t i = 0; i < group_info.count; i++) { + reply->add_entity_ids(group_info.entity_ids[i]); + } + + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::GetGroupAllIds( - ::grpc::ServerContext* context, - const ::rdc::Empty* request, - ::rdc::GetGroupAllIdsResponse* reply) { - if (!reply || !request || !context) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - - rdc_gpu_group_t group_id_list[RDC_MAX_NUM_GROUPS]; - uint32_t count = 0; - rdc_status_t result = rdc_group_get_all_ids( - rdc_handle_, group_id_list, &count); - reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - - for (uint32_t i = 0; i < count; i++) { - reply->add_group_ids(group_id_list[i]); - } +::grpc::Status RdcAPIServiceImpl::GetGroupAllIds(::grpc::ServerContext* context, + const ::rdc::Empty* request, + ::rdc::GetGroupAllIdsResponse* reply) { + if (!reply || !request || !context) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_gpu_group_t group_id_list[RDC_MAX_NUM_GROUPS]; + uint32_t count = 0; + rdc_status_t result = rdc_group_get_all_ids(rdc_handle_, group_id_list, &count); + reply->set_status(result); + if (result != RDC_ST_OK) { 
return ::grpc::Status::OK; + } + + for (uint32_t i = 0; i < count; i++) { + reply->add_group_ids(group_id_list[i]); + } + + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::GetFieldGroupAllIds( - ::grpc::ServerContext* context, - const ::rdc::Empty* request, - ::rdc::GetFieldGroupAllIdsResponse* reply) { - if (!reply || !request || !context) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - - rdc_field_grp_t field_group_id_list[RDC_MAX_NUM_FIELD_GROUPS]; - uint32_t count = 0; - rdc_status_t result = rdc_group_field_get_all_ids( - rdc_handle_, field_group_id_list, &count); - reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - - for (uint32_t i = 0; i < count; i++) { - reply->add_field_group_ids(field_group_id_list[i]); - } +::grpc::Status RdcAPIServiceImpl::GetFieldGroupAllIds(::grpc::ServerContext* context, + const ::rdc::Empty* request, + ::rdc::GetFieldGroupAllIdsResponse* reply) { + if (!reply || !request || !context) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_field_grp_t field_group_id_list[RDC_MAX_NUM_FIELD_GROUPS]; + uint32_t count = 0; + rdc_status_t result = rdc_group_field_get_all_ids(rdc_handle_, field_group_id_list, &count); + reply->set_status(result); + if (result != RDC_ST_OK) { return ::grpc::Status::OK; + } + + for (uint32_t i = 0; i < count; i++) { + reply->add_field_group_ids(field_group_id_list[i]); + } + + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::DestroyGpuGroup( - ::grpc::ServerContext* context, - const ::rdc::DestroyGpuGroupRequest* request, - ::rdc::DestroyGpuGroupResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } +::grpc::Status RdcAPIServiceImpl::DestroyGpuGroup(::grpc::ServerContext* context, + const ::rdc::DestroyGpuGroupRequest* request, + ::rdc::DestroyGpuGroupResponse* reply) { + (void)(context); + 
if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } - rdc_status_t result = rdc_group_gpu_destroy( - rdc_handle_, request->group_id()); - reply->set_status(result); + rdc_status_t result = rdc_group_gpu_destroy(rdc_handle_, request->group_id()); + reply->set_status(result); - return ::grpc::Status::OK; + return ::grpc::Status::OK; } +::grpc::Status RdcAPIServiceImpl::CreateFieldGroup(::grpc::ServerContext* context, + const ::rdc::CreateFieldGroupRequest* request, + ::rdc::CreateFieldGroupResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } -::grpc::Status RdcAPIServiceImpl::CreateFieldGroup( - ::grpc::ServerContext* context, - const ::rdc::CreateFieldGroupRequest* request, - ::rdc::CreateFieldGroupResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - - rdc_field_grp_t field_group_id; - rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; - for (int i = 0; i < request->field_ids_size(); i++) { - field_ids[i] = static_cast(request->field_ids(i)); - } - rdc_status_t result = rdc_group_field_create( - rdc_handle_, request->field_ids_size() , &field_ids[0], - request->field_group_name().c_str(), &field_group_id); - reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - - reply->set_field_group_id(field_group_id); - + rdc_field_grp_t field_group_id; + rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; + for (int i = 0; i < request->field_ids_size(); i++) { + field_ids[i] = static_cast(request->field_ids(i)); + } + rdc_status_t result = + rdc_group_field_create(rdc_handle_, request->field_ids_size(), &field_ids[0], + request->field_group_name().c_str(), &field_group_id); + reply->set_status(result); + if (result != RDC_ST_OK) { return ::grpc::Status::OK; + } + + 
reply->set_field_group_id(field_group_id); + + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::GetFieldGroupInfo( - ::grpc::ServerContext* context, - const ::rdc::GetFieldGroupInfoRequest* request, - ::rdc::GetFieldGroupInfoResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - - rdc_field_group_info_t field_info; - rdc_status_t result = rdc_group_field_get_info( - rdc_handle_, request->field_group_id(), &field_info); - reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - - reply->set_filed_group_name(field_info.group_name); - for (uint32_t i = 0; i < field_info.count; i++) { - reply->add_field_ids(field_info.field_ids[i]); - } +::grpc::Status RdcAPIServiceImpl::GetFieldGroupInfo(::grpc::ServerContext* context, + const ::rdc::GetFieldGroupInfoRequest* request, + ::rdc::GetFieldGroupInfoResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_field_group_info_t field_info; + rdc_status_t result = + rdc_group_field_get_info(rdc_handle_, request->field_group_id(), &field_info); + reply->set_status(result); + if (result != RDC_ST_OK) { return ::grpc::Status::OK; + } + + reply->set_filed_group_name(field_info.group_name); + for (uint32_t i = 0; i < field_info.count; i++) { + reply->add_field_ids(field_info.field_ids[i]); + } + + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::DestroyFieldGroup( - ::grpc::ServerContext* context, - const ::rdc::DestroyFieldGroupRequest* request, - ::rdc::DestroyFieldGroupResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } +::grpc::Status RdcAPIServiceImpl::DestroyFieldGroup(::grpc::ServerContext* context, + const ::rdc::DestroyFieldGroupRequest* request, + ::rdc::DestroyFieldGroupResponse* reply) { + 
(void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } - rdc_status_t result = rdc_group_field_destroy( - rdc_handle_, request->field_group_id()); - reply->set_status(result); + rdc_status_t result = rdc_group_field_destroy(rdc_handle_, request->field_group_id()); + reply->set_status(result); - return ::grpc::Status::OK; + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::WatchFields( - ::grpc::ServerContext* context, - const ::rdc::WatchFieldsRequest* request, - ::rdc::WatchFieldsResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } +::grpc::Status RdcAPIServiceImpl::WatchFields(::grpc::ServerContext* context, + const ::rdc::WatchFieldsRequest* request, + ::rdc::WatchFieldsResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } - rdc_status_t result = rdc_field_watch( - rdc_handle_, request->group_id(), request->field_group_id(), - request->update_freq(), request->max_keep_age(), - request->max_keep_samples()); - reply->set_status(result); + rdc_status_t result = + rdc_field_watch(rdc_handle_, request->group_id(), request->field_group_id(), + request->update_freq(), request->max_keep_age(), request->max_keep_samples()); + reply->set_status(result); - return ::grpc::Status::OK; + return ::grpc::Status::OK; } ::grpc::Status RdcAPIServiceImpl::GetLatestFieldValue( - ::grpc::ServerContext* context, - const ::rdc::GetLatestFieldValueRequest* request, - ::rdc::GetLatestFieldValueResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - - rdc_field_value value; - rdc_status_t result = rdc_field_get_latest_value(rdc_handle_, - request->gpu_index(), static_cast(request->field_id()), - &value); - reply->set_status(result); - if (result 
!= RDC_ST_OK) { - return ::grpc::Status::OK; - } - - reply->set_field_id(value.field_id); - reply->set_rdc_status(value.status); - reply->set_ts(value.ts); - reply->set_type(static_cast<::rdc::GetLatestFieldValueResponse_FieldType> - (value.type)); - if (value.type == INTEGER) { - reply->set_l_int(value.value.l_int); - } else if (value.type == DOUBLE) { - reply->set_dbl(value.value.dbl); - } else if (value.type == STRING || value.type == BLOB) { - reply->set_str(value.value.str); - } + ::grpc::ServerContext* context, const ::rdc::GetLatestFieldValueRequest* request, + ::rdc::GetLatestFieldValueResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_field_value value; + rdc_status_t result = rdc_field_get_latest_value( + rdc_handle_, request->gpu_index(), static_cast(request->field_id()), &value); + reply->set_status(result); + if (result != RDC_ST_OK) { return ::grpc::Status::OK; + } + + reply->set_field_id(value.field_id); + reply->set_rdc_status(value.status); + reply->set_ts(value.ts); + reply->set_type(static_cast<::rdc::GetLatestFieldValueResponse_FieldType>(value.type)); + if (value.type == INTEGER) { + reply->set_l_int(value.value.l_int); + } else if (value.type == DOUBLE) { + reply->set_dbl(value.value.dbl); + } else if (value.type == STRING || value.type == BLOB) { + reply->set_str(value.value.str); + } + + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::GetFieldSince( - ::grpc::ServerContext* context, - const ::rdc::GetFieldSinceRequest* request, - ::rdc::GetFieldSinceResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - - rdc_field_value value; - uint64_t next_timestamp; - rdc_status_t result = rdc_field_get_value_since(rdc_handle_, - request->gpu_index(), static_cast(request->field_id()), - request->since_time_stamp(), &next_timestamp, &value); - 
reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - - reply->set_next_since_time_stamp(next_timestamp); - reply->set_field_id(value.field_id); - reply->set_rdc_status(value.status); - reply->set_ts(value.ts); - reply->set_type(static_cast<::rdc::GetFieldSinceResponse_FieldType> - (value.type)); - if (value.type == INTEGER) { - reply->set_l_int(value.value.l_int); - } else if (value.type == DOUBLE) { - reply->set_dbl(value.value.dbl); - } else if (value.type == STRING || value.type == BLOB) { - std::string val_str(value.value.str); - size_t endpos = val_str.find_last_not_of(" "); - val_str[endpos + 1] = '\0'; - reply->set_str(val_str); - } +::grpc::Status RdcAPIServiceImpl::GetFieldSince(::grpc::ServerContext* context, + const ::rdc::GetFieldSinceRequest* request, + ::rdc::GetFieldSinceResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_field_value value; + uint64_t next_timestamp; + rdc_status_t result = rdc_field_get_value_since( + rdc_handle_, request->gpu_index(), static_cast(request->field_id()), + request->since_time_stamp(), &next_timestamp, &value); + reply->set_status(result); + if (result != RDC_ST_OK) { return ::grpc::Status::OK; + } + + reply->set_next_since_time_stamp(next_timestamp); + reply->set_field_id(value.field_id); + reply->set_rdc_status(value.status); + reply->set_ts(value.ts); + reply->set_type(static_cast<::rdc::GetFieldSinceResponse_FieldType>(value.type)); + if (value.type == INTEGER) { + reply->set_l_int(value.value.l_int); + } else if (value.type == DOUBLE) { + reply->set_dbl(value.value.dbl); + } else if (value.type == STRING || value.type == BLOB) { + std::string val_str(value.value.str); + size_t endpos = val_str.find_last_not_of(" "); + val_str[endpos + 1] = '\0'; + reply->set_str(val_str); + } + + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::UnWatchFields( - ::grpc::ServerContext* 
context, - const ::rdc::UnWatchFieldsRequest* request, - ::rdc::UnWatchFieldsResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } +::grpc::Status RdcAPIServiceImpl::UnWatchFields(::grpc::ServerContext* context, + const ::rdc::UnWatchFieldsRequest* request, + ::rdc::UnWatchFieldsResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } - rdc_status_t result = rdc_field_unwatch( - rdc_handle_, request->group_id(), request->field_group_id()); - reply->set_status(result); + rdc_status_t result = + rdc_field_unwatch(rdc_handle_, request->group_id(), request->field_group_id()); + reply->set_status(result); - return ::grpc::Status::OK; + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::UpdateAllFields( - ::grpc::ServerContext* context, - const ::rdc::UpdateAllFieldsRequest* request, - ::rdc::UpdateAllFieldsResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } +::grpc::Status RdcAPIServiceImpl::UpdateAllFields(::grpc::ServerContext* context, + const ::rdc::UpdateAllFieldsRequest* request, + ::rdc::UpdateAllFieldsResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } - rdc_status_t result = rdc_field_update_all( - rdc_handle_, request->wait_for_update()); - reply->set_status(result); + rdc_status_t result = rdc_field_update_all(rdc_handle_, request->wait_for_update()); + reply->set_status(result); - return ::grpc::Status::OK; + return ::grpc::Status::OK; } +::grpc::Status RdcAPIServiceImpl::StartJobStats(::grpc::ServerContext* context, + const ::rdc::StartJobStatsRequest* request, + ::rdc::StartJobStatsResponse* reply) { + (void)(context); + if (!reply || !request) { + return 
::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_status_t result = + rdc_job_start_stats(rdc_handle_, request->group_id(), + const_cast(request->job_id().c_str()), request->update_freq()); + reply->set_status(result); -::grpc::Status RdcAPIServiceImpl::StartJobStats( - ::grpc::ServerContext* context, - const ::rdc::StartJobStatsRequest* request, - ::rdc::StartJobStatsResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - rdc_status_t result = rdc_job_start_stats( - rdc_handle_, request->group_id(), - const_cast(request->job_id().c_str()), - request->update_freq()); - reply->set_status(result); - - return ::grpc::Status::OK; + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::GetJobStats( - ::grpc::ServerContext* context, - const ::rdc::GetJobStatsRequest* request, - ::rdc::GetJobStatsResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } +::grpc::Status RdcAPIServiceImpl::GetJobStats(::grpc::ServerContext* context, + const ::rdc::GetJobStatsRequest* request, + ::rdc::GetJobStatsResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } - rdc_job_info_t job_info; - rdc_status_t result = rdc_job_get_stats( - rdc_handle_, - const_cast(request->job_id().c_str()), - &job_info); - - reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - - reply->set_num_gpus(job_info.num_gpus); - ::rdc::GpuUsageInfo* sinfo = reply->mutable_summary(); - copy_gpu_usage_info(job_info.summary, sinfo); - - for (uint32_t i = 0; i < job_info.num_gpus; i++) { - ::rdc::GpuUsageInfo* ginfo = reply->add_gpus(); - copy_gpu_usage_info(job_info.gpus[i], ginfo); - } + rdc_job_info_t job_info; + rdc_status_t result = + rdc_job_get_stats(rdc_handle_, 
const_cast(request->job_id().c_str()), &job_info); + reply->set_status(result); + if (result != RDC_ST_OK) { return ::grpc::Status::OK; + } + + reply->set_num_gpus(job_info.num_gpus); + ::rdc::GpuUsageInfo* sinfo = reply->mutable_summary(); + copy_gpu_usage_info(job_info.summary, sinfo); + + for (uint32_t i = 0; i < job_info.num_gpus; i++) { + ::rdc::GpuUsageInfo* ginfo = reply->add_gpus(); + copy_gpu_usage_info(job_info.gpus[i], ginfo); + } + + return ::grpc::Status::OK; } bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, - ::rdc::GpuUsageInfo* target) { - if (target == nullptr) { - return false; - } + ::rdc::GpuUsageInfo* target) { + if (target == nullptr) { + return false; + } - target->set_gpu_id(src.gpu_id); - target->set_start_time(src.start_time); - target->set_end_time(src.end_time); - target->set_energy_consumed(src.energy_consumed); - target->set_max_gpu_memory_used(src.max_gpu_memory_used); - target->set_ecc_correct(src.ecc_correct); - target->set_ecc_uncorrect(src.ecc_uncorrect); + target->set_gpu_id(src.gpu_id); + target->set_start_time(src.start_time); + target->set_end_time(src.end_time); + target->set_energy_consumed(src.energy_consumed); + target->set_max_gpu_memory_used(src.max_gpu_memory_used); + target->set_ecc_correct(src.ecc_correct); + target->set_ecc_uncorrect(src.ecc_uncorrect); - ::rdc::JobStatsSummary* stats = target->mutable_power_usage(); - stats->set_max_value(src.power_usage.max_value); - stats->set_min_value(src.power_usage.min_value); - stats->set_average(src.power_usage.average); - stats->set_standard_deviation(src.power_usage.standard_deviation); + ::rdc::JobStatsSummary* stats = target->mutable_power_usage(); + stats->set_max_value(src.power_usage.max_value); + stats->set_min_value(src.power_usage.min_value); + stats->set_average(src.power_usage.average); + stats->set_standard_deviation(src.power_usage.standard_deviation); - stats = target->mutable_gpu_clock(); - 
stats->set_max_value(src.gpu_clock.max_value); - stats->set_min_value(src.gpu_clock.min_value); - stats->set_average(src.gpu_clock.average); - stats->set_standard_deviation(src.gpu_clock.standard_deviation); + stats = target->mutable_gpu_clock(); + stats->set_max_value(src.gpu_clock.max_value); + stats->set_min_value(src.gpu_clock.min_value); + stats->set_average(src.gpu_clock.average); + stats->set_standard_deviation(src.gpu_clock.standard_deviation); - stats = target->mutable_gpu_utilization(); - stats->set_max_value(src.gpu_utilization.max_value); - stats->set_min_value(src.gpu_utilization.min_value); - stats->set_average(src.gpu_utilization.average); - stats->set_standard_deviation(src.gpu_utilization.standard_deviation); + stats = target->mutable_gpu_utilization(); + stats->set_max_value(src.gpu_utilization.max_value); + stats->set_min_value(src.gpu_utilization.min_value); + stats->set_average(src.gpu_utilization.average); + stats->set_standard_deviation(src.gpu_utilization.standard_deviation); - stats = target->mutable_memory_utilization(); - stats->set_max_value(src.memory_utilization.max_value); - stats->set_min_value(src.memory_utilization.min_value); - stats->set_average(src.memory_utilization.average); - stats->set_standard_deviation(src.memory_utilization.standard_deviation); + stats = target->mutable_memory_utilization(); + stats->set_max_value(src.memory_utilization.max_value); + stats->set_min_value(src.memory_utilization.min_value); + stats->set_average(src.memory_utilization.average); + stats->set_standard_deviation(src.memory_utilization.standard_deviation); - stats = target->mutable_pcie_tx(); - stats->set_max_value(src.pcie_tx.max_value); - stats->set_min_value(src.pcie_tx.min_value); - stats->set_average(src.pcie_tx.average); - stats->set_standard_deviation(src.pcie_tx.standard_deviation); + stats = target->mutable_pcie_tx(); + stats->set_max_value(src.pcie_tx.max_value); + stats->set_min_value(src.pcie_tx.min_value); + 
stats->set_average(src.pcie_tx.average); + stats->set_standard_deviation(src.pcie_tx.standard_deviation); - stats = target->mutable_pcie_rx(); - stats->set_max_value(src.pcie_rx.max_value); - stats->set_min_value(src.pcie_rx.min_value); - stats->set_average(src.pcie_rx.average); - stats->set_standard_deviation(src.pcie_rx.standard_deviation); + stats = target->mutable_pcie_rx(); + stats->set_max_value(src.pcie_rx.max_value); + stats->set_min_value(src.pcie_rx.min_value); + stats->set_average(src.pcie_rx.average); + stats->set_standard_deviation(src.pcie_rx.standard_deviation); - stats = target->mutable_memory_clock(); - stats->set_max_value(src.memory_clock.max_value); - stats->set_min_value(src.memory_clock.min_value); - stats->set_average(src.memory_clock.average); - stats->set_standard_deviation(src.memory_clock.standard_deviation); + stats = target->mutable_memory_clock(); + stats->set_max_value(src.memory_clock.max_value); + stats->set_min_value(src.memory_clock.min_value); + stats->set_average(src.memory_clock.average); + stats->set_standard_deviation(src.memory_clock.standard_deviation); - stats = target->mutable_gpu_temperature(); - stats->set_max_value(src.gpu_temperature.max_value); - stats->set_min_value(src.gpu_temperature.min_value); - stats->set_average(src.gpu_temperature.average); - stats->set_standard_deviation(src.gpu_temperature.standard_deviation); + stats = target->mutable_gpu_temperature(); + stats->set_max_value(src.gpu_temperature.max_value); + stats->set_min_value(src.gpu_temperature.min_value); + stats->set_average(src.gpu_temperature.average); + stats->set_standard_deviation(src.gpu_temperature.standard_deviation); - return true; + return true; } +::grpc::Status RdcAPIServiceImpl::StopJobStats(::grpc::ServerContext* context, + const ::rdc::StopJobStatsRequest* request, + ::rdc::StopJobStatsResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } 
-::grpc::Status RdcAPIServiceImpl::StopJobStats( - ::grpc::ServerContext* context, - const ::rdc::StopJobStatsRequest* request, - ::rdc::StopJobStatsResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } + rdc_status_t result = + rdc_job_stop_stats(rdc_handle_, const_cast(request->job_id().c_str())); + reply->set_status(result); - rdc_status_t result = rdc_job_stop_stats( - rdc_handle_, - const_cast(request->job_id().c_str())); - reply->set_status(result); - - return ::grpc::Status::OK; + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::RemoveJob( - ::grpc::ServerContext* context, - const ::rdc::RemoveJobRequest* request, - ::rdc::RemoveJobResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - rdc_status_t result = rdc_job_remove( - rdc_handle_, const_cast(request->job_id().c_str())); - reply->set_status(result); +::grpc::Status RdcAPIServiceImpl::RemoveJob(::grpc::ServerContext* context, + const ::rdc::RemoveJobRequest* request, + ::rdc::RemoveJobResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_status_t result = rdc_job_remove(rdc_handle_, const_cast(request->job_id().c_str())); + reply->set_status(result); - return ::grpc::Status::OK; + return ::grpc::Status::OK; } +::grpc::Status RdcAPIServiceImpl::RemoveAllJob(::grpc::ServerContext* context, + const ::rdc::Empty* request, + ::rdc::RemoveAllJobResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_status_t result = rdc_job_remove_all(rdc_handle_); + reply->set_status(result); -::grpc::Status RdcAPIServiceImpl::RemoveAllJob( - ::grpc::ServerContext* context, - const ::rdc::Empty* request, - ::rdc::RemoveAllJobResponse* reply) { - 
(void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } - rdc_status_t result = rdc_job_remove_all(rdc_handle_); - reply->set_status(result); - - - return ::grpc::Status::OK; + return ::grpc::Status::OK; } -::grpc::Status RdcAPIServiceImpl::DiagnosticRun( - ::grpc::ServerContext* context, - const ::rdc::DiagnosticRunRequest* request, - ::rdc::DiagnosticRunResponse* reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } +::grpc::Status RdcAPIServiceImpl::DiagnosticRun(::grpc::ServerContext* context, + const ::rdc::DiagnosticRunRequest* request, + ::rdc::DiagnosticRunResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } - rdc_diag_response_t diag_response; - rdc_status_t result = rdc_diagnostic_run( - rdc_handle_, - request->group_id(), - static_cast(request->level()), - &diag_response); - - reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - - ::rdc::DiagnosticResponse* to_response = reply->mutable_response(); - to_response->set_results_count(diag_response.results_count); - - for (uint32_t i=0 ; i < diag_response.results_count; i++) { - const rdc_diag_test_result_t& test_result = - diag_response.diag_info[i]; - ::rdc::DiagnosticTestResult* to_diag_info = - to_response->add_diag_info(); - - to_diag_info->set_status(test_result.status); - - // details - auto to_details = to_diag_info->mutable_details(); - to_details->set_code(test_result.details.code); - to_details->set_msg(test_result.details.msg); - - to_diag_info->set_test_case( - static_cast<::rdc::DiagnosticTestResult_DiagnosticTestCase> - (test_result.test_case)); - to_diag_info->set_per_gpu_result_count( - test_result.per_gpu_result_count); - - // gpu_results - for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) { - auto to_result = 
to_diag_info->add_gpu_results(); - const rdc_diag_per_gpu_result_t& cur_result = - test_result.gpu_results[j]; - to_result->set_gpu_index(cur_result.gpu_index); - auto to_per_detail = to_result->mutable_gpu_result(); - to_per_detail->set_code(cur_result.gpu_result.code); - to_per_detail->set_msg(cur_result.gpu_result.msg); - } - to_diag_info->set_info(test_result.info); - } + rdc_diag_response_t diag_response; + rdc_status_t result = + rdc_diagnostic_run(rdc_handle_, request->group_id(), + static_cast(request->level()), &diag_response); + reply->set_status(result); + if (result != RDC_ST_OK) { return ::grpc::Status::OK; + } + + ::rdc::DiagnosticResponse* to_response = reply->mutable_response(); + to_response->set_results_count(diag_response.results_count); + + for (uint32_t i = 0; i < diag_response.results_count; i++) { + const rdc_diag_test_result_t& test_result = diag_response.diag_info[i]; + ::rdc::DiagnosticTestResult* to_diag_info = to_response->add_diag_info(); + + to_diag_info->set_status(test_result.status); + + // details + auto to_details = to_diag_info->mutable_details(); + to_details->set_code(test_result.details.code); + to_details->set_msg(test_result.details.msg); + + to_diag_info->set_test_case( + static_cast<::rdc::DiagnosticTestResult_DiagnosticTestCase>(test_result.test_case)); + to_diag_info->set_per_gpu_result_count(test_result.per_gpu_result_count); + + // gpu_results + for (uint32_t j = 0; j < test_result.per_gpu_result_count; j++) { + auto to_result = to_diag_info->add_gpu_results(); + const rdc_diag_per_gpu_result_t& cur_result = test_result.gpu_results[j]; + to_result->set_gpu_index(cur_result.gpu_index); + auto to_per_detail = to_result->mutable_gpu_result(); + to_per_detail->set_code(cur_result.gpu_result.code); + to_per_detail->set_msg(cur_result.gpu_result.msg); + } + to_diag_info->set_info(test_result.info); + } + + return ::grpc::Status::OK; } ::grpc::Status RdcAPIServiceImpl::DiagnosticTestCaseRun( - ::grpc::ServerContext *context, 
- const ::rdc::DiagnosticTestCaseRunRequest *request, - ::rdc::DiagnosticTestCaseRunResponse *reply) { - (void)(context); - if (!reply || !request) { - return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); - } + ::grpc::ServerContext* context, const ::rdc::DiagnosticTestCaseRunRequest* request, + ::rdc::DiagnosticTestCaseRunResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } - rdc_diag_test_result_t test_result; - rdc_status_t result = rdc_test_case_run( - rdc_handle_, - request->group_id(), - static_cast(request->test_case()), - &test_result); + rdc_diag_test_result_t test_result; + rdc_status_t result = + rdc_test_case_run(rdc_handle_, request->group_id(), + static_cast(request->test_case()), &test_result); - reply->set_status(result); - if (result != RDC_ST_OK) { - return ::grpc::Status::OK; - } - ::rdc::DiagnosticTestResult *to_diag_info = - reply->mutable_result(); - to_diag_info->set_status(test_result.status); + reply->set_status(result); + if (result != RDC_ST_OK) { + return ::grpc::Status::OK; + } + ::rdc::DiagnosticTestResult* to_diag_info = reply->mutable_result(); + to_diag_info->set_status(test_result.status); - // details - auto to_details = to_diag_info->mutable_details(); - to_details->set_code(test_result.details.code); - to_details->set_msg(test_result.details.msg); + // details + auto to_details = to_diag_info->mutable_details(); + to_details->set_code(test_result.details.code); + to_details->set_msg(test_result.details.msg); - to_diag_info->set_test_case( - static_cast<::rdc::DiagnosticTestResult_DiagnosticTestCase>( - test_result.test_case)); - to_diag_info->set_per_gpu_result_count( - test_result.per_gpu_result_count); + to_diag_info->set_test_case( + static_cast<::rdc::DiagnosticTestResult_DiagnosticTestCase>(test_result.test_case)); + to_diag_info->set_per_gpu_result_count(test_result.per_gpu_result_count); - // gpu_results - for 
(uint32_t j = 0; j < test_result.per_gpu_result_count; j++) { - auto to_result = to_diag_info->add_gpu_results(); - const rdc_diag_per_gpu_result_t &cur_result = - test_result.gpu_results[j]; - to_result->set_gpu_index(cur_result.gpu_index); - auto to_per_detail = to_result->mutable_gpu_result(); - to_per_detail->set_code(cur_result.gpu_result.code); - to_per_detail->set_msg(cur_result.gpu_result.msg); - } - to_diag_info->set_info(test_result.info); + // gpu_results + for (uint32_t j = 0; j < test_result.per_gpu_result_count; j++) { + auto to_result = to_diag_info->add_gpu_results(); + const rdc_diag_per_gpu_result_t& cur_result = test_result.gpu_results[j]; + to_result->set_gpu_index(cur_result.gpu_index); + auto to_per_detail = to_result->mutable_gpu_result(); + to_per_detail->set_code(cur_result.gpu_result.code); + to_per_detail->set_msg(cur_result.gpu_result.msg); + } + to_diag_info->set_info(test_result.info); - return ::grpc::Status::OK; + return ::grpc::Status::OK; } } // namespace rdc } // namespace amd - - diff --git a/projects/rdc/server/src/rdc_rsmi_service.cc b/projects/rdc/server/src/rdc_rsmi_service.cc old mode 100755 new mode 100644 index face11402c..431586f09a --- a/projects/rdc/server/src/rdc_rsmi_service.cc +++ b/projects/rdc/server/src/rdc_rsmi_service.cc @@ -20,23 +20,23 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "rdc/rdc_rsmi_service.h" + #include #include +#include #include #include #include -#include #include "rdc.grpc.pb.h" // NOLINT #include "rocm_smi/rocm_smi.h" -#include "rdc/rdc_rsmi_service.h" namespace amd { namespace rdc { -RsmiServiceImpl::RsmiServiceImpl():rsmi_initialized_(false) { -} +RsmiServiceImpl::RsmiServiceImpl() : rsmi_initialized_(false) {} RsmiServiceImpl::~RsmiServiceImpl() { if (rsmi_initialized_) { @@ -48,14 +48,12 @@ RsmiServiceImpl::~RsmiServiceImpl() { // rsmi and rdc currently happen to have a 1-to-1 mapping, but // have this function in case that changes -static rsmi_temperature_metric_t - rdc_temp2rsmi_temp(::rdc::GetTemperatureRequest_TemperatureMetric - rdc_temp) { +static rsmi_temperature_metric_t rdc_temp2rsmi_temp( + ::rdc::GetTemperatureRequest_TemperatureMetric rdc_temp) { return static_cast(rdc_temp); } -rsmi_status_t -RsmiServiceImpl::Initialize(uint64_t rsmi_init_flags) { +rsmi_status_t RsmiServiceImpl::Initialize(uint64_t rsmi_init_flags) { rsmi_status_t rsmi_ret = rsmi_init(rsmi_init_flags); if (rsmi_ret != RSMI_STATUS_SUCCESS) { std::cout << "rsmi_init() returned error" << std::endl; @@ -65,10 +63,9 @@ RsmiServiceImpl::Initialize(uint64_t rsmi_init_flags) { return rsmi_ret; } -::grpc::Status -RsmiServiceImpl::GetNumDevices(::grpc::ServerContext* context, - const ::rdc::GetNumDevicesRequest* request, - ::rdc::GetNumDevicesResponse* reply) { +::grpc::Status RsmiServiceImpl::GetNumDevices(::grpc::ServerContext* context, + const ::rdc::GetNumDevicesRequest* request, + ::rdc::GetNumDevicesResponse* reply) { assert(reply != nullptr); uint32_t num_devices; @@ -88,65 +85,58 @@ RsmiServiceImpl::GetNumDevices(::grpc::ServerContext* context, return ::grpc::Status::OK; } -::grpc::Status -RsmiServiceImpl::GetTemperature(::grpc::ServerContext* context, - const ::rdc::GetTemperatureRequest* request, - ::rdc::GetTemperatureResponse* response) { +::grpc::Status RsmiServiceImpl::GetTemperature(::grpc::ServerContext* context, + const 
::rdc::GetTemperatureRequest* request, + ::rdc::GetTemperatureResponse* response) { (void)context; // Quiet warning for now; assert(response != nullptr); int64_t temperature; - rsmi_status_t ret = rsmi_dev_temp_metric_get(request->dv_ind(), - request->sensor_type(), rdc_temp2rsmi_temp(request->metric()), - &temperature); + rsmi_status_t ret = rsmi_dev_temp_metric_get(request->dv_ind(), request->sensor_type(), + rdc_temp2rsmi_temp(request->metric()), &temperature); response->set_temperature(temperature); response->set_ret_val(ret); return ::grpc::Status::OK; } -::grpc::Status -RsmiServiceImpl::GetFanRpms(::grpc::ServerContext* context, - const ::rdc::GetFanRpmsRequest* request, - ::rdc::GetFanRpmsResponse* response) { +::grpc::Status RsmiServiceImpl::GetFanRpms(::grpc::ServerContext* context, + const ::rdc::GetFanRpmsRequest* request, + ::rdc::GetFanRpmsResponse* response) { (void)context; // Quiet warning for now; assert(response != nullptr); int64_t rpms; - rsmi_status_t ret = rsmi_dev_fan_rpms_get(request->dv_ind(), - request->sensor_ind(), &rpms); + rsmi_status_t ret = rsmi_dev_fan_rpms_get(request->dv_ind(), request->sensor_ind(), &rpms); response->set_rpms(rpms); response->set_ret_val(ret); return ::grpc::Status::OK; } -::grpc::Status -RsmiServiceImpl::GetFanSpeed(::grpc::ServerContext* context, - const ::rdc::GetFanSpeedRequest* request, - ::rdc::GetFanSpeedResponse* response) { +::grpc::Status RsmiServiceImpl::GetFanSpeed(::grpc::ServerContext* context, + const ::rdc::GetFanSpeedRequest* request, + ::rdc::GetFanSpeedResponse* response) { (void)context; // Quiet warning for now; assert(response != nullptr); int64_t speed; - rsmi_status_t ret = rsmi_dev_fan_speed_get(request->dv_ind(), - request->sensor_ind(), &speed); + rsmi_status_t ret = rsmi_dev_fan_speed_get(request->dv_ind(), request->sensor_ind(), &speed); response->set_speed(speed); response->set_ret_val(ret); return ::grpc::Status::OK; } -::grpc::Status 
-RsmiServiceImpl::GetFanSpeedMax(::grpc::ServerContext* context, - const ::rdc::GetFanSpeedMaxRequest* request, - ::rdc::GetFanSpeedMaxResponse* response) { +::grpc::Status RsmiServiceImpl::GetFanSpeedMax(::grpc::ServerContext* context, + const ::rdc::GetFanSpeedMaxRequest* request, + ::rdc::GetFanSpeedMaxResponse* response) { (void)context; // Quiet warning for now; assert(response != nullptr); uint64_t max_speed; - rsmi_status_t ret = rsmi_dev_fan_speed_max_get(request->dv_ind(), - request->sensor_ind(), &max_speed); + rsmi_status_t ret = + rsmi_dev_fan_speed_max_get(request->dv_ind(), request->sensor_ind(), &max_speed); response->set_max_speed(max_speed); response->set_ret_val(ret); diff --git a/projects/rdc/server/src/rdc_server_main.cc b/projects/rdc/server/src/rdc_server_main.cc old mode 100755 new mode 100644 index 11526c1d4b..d3b56f5555 --- a/projects/rdc/server/src/rdc_server_main.cc +++ b/projects/rdc/server/src/rdc_server_main.cc @@ -20,29 +20,31 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "rdc/rdc_server_main.h" + #include #include -#include -#include -#include -#include -#include -#include -#include #include +#include +#include #include +#include +#include +#include +#include +#include + +#include #include #include #include -#include -#include "rdc.grpc.pb.h" // NOLINT -#include "rocm_smi/rocm_smi.h" -#include "rdc/rdc_server_main.h" -#include "rdc/rdc_rsmi_service.h" -#include "rdc/rdc_api_service.h" #include "common/rdc_capabilities.h" #include "common/rdc_utils.h" +#include "rdc.grpc.pb.h" // NOLINT +#include "rdc/rdc_api_service.h" +#include "rdc/rdc_rsmi_service.h" +#include "rocm_smi/rocm_smi.h" // TODO(cfreehil): // The following need to be made configurable (e.g., from YAML): @@ -56,43 +58,33 @@ THE SOFTWARE. 
static bool sShutDownServer = false; static bool sRestartServer = false; -static const char *kDaemonName = "rdcd"; -static const char *kRDCDHomeDir = "/"; -static const char *kDaemonLockFileRoot = "/var/run/rdcd.lock"; -static const char *kDaemonLockFile = "/tmp/rdcd.lock"; +static const char* kDaemonName = "rdcd"; +static const char* kRDCDHomeDir = "/"; +static const char* kDaemonLockFileRoot = "/var/run/rdcd.lock"; +static const char* kDaemonLockFile = "/tmp/rdcd.lock"; // Pinned certificates -static const char * kDefaultRDCServerCertPinPath = - "/etc/rdc/server/rdc_server.crt"; -static const char * kDefaultRDCServerKeyPinPath = - "/etc/rdc/server/private/rdc_server.key"; -static const char * kDefaultRDCClientCertPinPath = - "/etc/rdc/client/rdc_client.crt"; +static const char* kDefaultRDCServerCertPinPath = "/etc/rdc/server/rdc_server.crt"; +static const char* kDefaultRDCServerKeyPinPath = "/etc/rdc/server/private/rdc_server.key"; +static const char* kDefaultRDCClientCertPinPath = "/etc/rdc/client/rdc_client.crt"; // PKI certificates -static const char * kDefaultRDCServerCertKeyPkiPath = - "/etc/rdc/server/private/rdc_server_cert.key"; -static const char * kDefaultRDCServerCertPemPkiPath = - "/etc/rdc/server/certs/rdc_server_cert.pem"; -static const char * kDefaultRDCClientCACertPemPkiPath = - "/etc/rdc/client/certs/rdc_cacert.pem"; +static const char* kDefaultRDCServerCertKeyPkiPath = "/etc/rdc/server/private/rdc_server_cert.key"; +static const char* kDefaultRDCServerCertPemPkiPath = "/etc/rdc/server/certs/rdc_server_cert.pem"; +static const char* kDefaultRDCClientCACertPemPkiPath = "/etc/rdc/client/certs/rdc_cacert.pem"; -static const char *kDefaultListenAddress = "0.0.0.0"; -static const char *kDefaultListenPort = "50051"; +static const char* kDefaultListenAddress = "0.0.0.0"; +static const char* kDefaultListenPort = "50051"; static const uint32_t kRSMIUMask = 027; -RDCServer::RDCServer() : - secure_creds_(false), rsmi_service_(nullptr), 
rdc_admin_service_(nullptr) { -} - -RDCServer::~RDCServer() { -} +RDCServer::RDCServer() + : secure_creds_(false), rsmi_service_(nullptr), rdc_admin_service_(nullptr) {} +RDCServer::~RDCServer() {} // TODO(cfreehil): resolve here command line options with // (future) config file options -void -RDCServer::Initialize(RdcdCmdLineOpts *cl) { +void RDCServer::Initialize(RdcdCmdLineOpts* cl) { cmd_line_ = cl; server_address_ = cmd_line_->listen_address; server_address_ += ":"; @@ -102,7 +94,7 @@ RDCServer::Initialize(RdcdCmdLineOpts *cl) { log_debug_ = cmd_line_->log_dbg; } -static int ConstructSSLOptsPin(grpc::SslServerCredentialsOptions *ssl_opts) { +static int ConstructSSLOptsPin(grpc::SslServerCredentialsOptions* ssl_opts) { assert(ssl_opts != nullptr); if (ssl_opts == nullptr) { return -EINVAL; @@ -133,14 +125,13 @@ static int ConstructSSLOptsPin(grpc::SslServerCredentialsOptions *ssl_opts) { } grpc::SslServerCredentialsOptions::PemKeyCertPair pkcp = {ser_key, ser_crt}; - ssl_opts->client_certificate_request = - GRPC_SSL_REQUEST_AND_REQUIRE_CLIENT_CERTIFICATE_AND_VERIFY; + ssl_opts->client_certificate_request = GRPC_SSL_REQUEST_AND_REQUIRE_CLIENT_CERTIFICATE_AND_VERIFY; ssl_opts->pem_root_certs = cli_crt; ssl_opts->pem_key_cert_pairs.push_back(pkcp); return 0; } -static int ConstructSSLOptsPKI(grpc::SslServerCredentialsOptions *ssl_opts) { +static int ConstructSSLOptsPKI(grpc::SslServerCredentialsOptions* ssl_opts) { assert(ssl_opts != nullptr); if (ssl_opts == nullptr) { return -EINVAL; @@ -171,15 +162,13 @@ static int ConstructSSLOptsPKI(grpc::SslServerCredentialsOptions *ssl_opts) { } grpc::SslServerCredentialsOptions::PemKeyCertPair pkcp = {ser_key, ser_crt}; - ssl_opts->client_certificate_request = - GRPC_SSL_REQUEST_AND_REQUIRE_CLIENT_CERTIFICATE_AND_VERIFY; + ssl_opts->client_certificate_request = GRPC_SSL_REQUEST_AND_REQUIRE_CLIENT_CERTIFICATE_AND_VERIFY; ssl_opts->pem_root_certs = cli_crt; ssl_opts->pem_key_cert_pairs.push_back(pkcp); return 0; } -void 
-RDCServer::Run() { +void RDCServer::Run() { ::grpc::ServerBuilder builder; int ret; if (secure_creds_) { @@ -192,15 +181,12 @@ RDCServer::Run() { } if (ret) { - std::cerr << "Failed to process OpenSSL keys and certificates. Errno: " - << -ret << std::endl; + std::cerr << "Failed to process OpenSSL keys and certificates. Errno: " << -ret << std::endl; return; } - builder.AddListeningPort(server_address_, - grpc::SslServerCredentials(ssl_opts)); + builder.AddListeningPort(server_address_, grpc::SslServerCredentials(ssl_opts)); } else { - builder.AddListeningPort(server_address_, - grpc::InsecureServerCredentials()); + builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials()); } // Register services as the instances through which we'll communicate with @@ -240,14 +226,13 @@ RDCServer::Run() { server_ = builder.BuildAndStart(); std::cout << "Server listening on " << server_address_.c_str() << std::endl; - std::cout << "Accepting " << - (secure_creds_ ? "Authenticated" : "Unauthenticated") << - " connections only." << std::endl; + std::cout << "Accepting " << (secure_creds_ ? "Authenticated" : "Unauthenticated") + << " connections only." << std::endl; server_->Wait(); } static void HandleSignal(int sig) { - std::cout << "Caught signal " << sig << std::endl; + std::cout << "Caught signal " << sig << std::endl; // For most signals, we will want to exit, so make that the default case // Handle the other signals specifically. 
@@ -259,9 +244,9 @@ static void HandleSignal(int sig) { // Grpc doesn't seem to handle stopping and restarting well, so // user must manually do these steps -// case SIGHUP: -// sRestartServer = true; -// break; + // case SIGHUP: + // sRestartServer = true; + // break; default: std::cerr << "Unexpected signal caught" << std::endl; @@ -274,8 +259,7 @@ static void InitializeSignalHandling(void) { signal(SIGTERM, HandleSignal); } -static int -FileOwner(const char *fn, std::string *owner) { +static int FileOwner(const char* fn, std::string* owner) { struct stat info; int ret; @@ -289,7 +273,7 @@ FileOwner(const char *fn, std::string *owner) { return errno; } struct passwd pw; - struct passwd *result; + struct passwd* result; char buf[20]; ret = getpwuid_r(info.st_uid, &pw, buf, 20, &result); @@ -302,8 +286,7 @@ FileOwner(const char *fn, std::string *owner) { return 0; } -void -RDCServer::ShutDown(void) { +void RDCServer::ShutDown(void) { server_->Shutdown(); if (rsmi_service_) { @@ -322,19 +305,19 @@ RDCServer::ShutDown(void) { } } -static void * ProcessSignalLoop(void *server_ptr) { +static void* ProcessSignalLoop(void* server_ptr) { assert(server_ptr != nullptr); - RDCServer *server = reinterpret_cast(server_ptr); + RDCServer* server = reinterpret_cast(server_ptr); while (1) { if (sShutDownServer) { - std::cout << "Shutting down RDC Server." << std::endl; + std::cout << "Shutting down RDC Server." << std::endl; server->ShutDown(); // We will need to add shutdown of any completion queues // here, when/if we add them break; } else if (sRestartServer) { - std::cout << "Re-starting RDC Server." << std::endl; + std::cout << "Re-starting RDC Server." 
<< std::endl; // We will need to add shutdown of any completion queues // here, when/if we add them server->ShutDown(); @@ -385,7 +368,7 @@ static bool FileIsLocked(std::string fn) { } static void ExitIfAlreadyRunning(bool is_root) { - const char *lock_fn; + const char* lock_fn; int lock_fh; std::string lf_user(kDaemonLockFile); std::string lf_root(kDaemonLockFileRoot); @@ -395,8 +378,7 @@ static void ExitIfAlreadyRunning(bool is_root) { bool is_locked = FileIsLocked(lock_file); if (is_locked) { - std::cerr << "File " << lock_file << - " is locked. Is rdcd already running?" << std::endl; + std::cerr << "File " << lock_file << " is locked. Is rdcd already running?" << std::endl; exit(1); } }; @@ -411,7 +393,7 @@ static void ExitIfAlreadyRunning(bool is_root) { } // Temporarily adjust file-mask to create file with right permissions umask(023); - lock_fh = open(lock_fn, O_RDWR|O_CREAT, 0644); + lock_fh = open(lock_fn, O_RDWR | O_CREAT, 0644); if (lock_fh < 0) { std::string user; @@ -420,9 +402,10 @@ static void ExitIfAlreadyRunning(bool is_root) { perror("Failed to determine owner of lock file."); exit(ret); } - std::cerr << "Failed to open file lock:" << lock_fn << " owned by user: " - << user << ". If starting rdcd as a different user, delete this " - "lock-file first." << std::endl; + std::cerr << "Failed to open file lock:" << lock_fn << " owned by user: " << user + << ". If starting rdcd as a different user, delete this " + "lock-file first." 
+ << std::endl; // asserting below since this should have been prevented in main() assert(!"Unexpected user invoking rdcd"); exit(1); @@ -439,8 +422,7 @@ static void ExitIfAlreadyRunning(bool is_root) { assert(static_cast(fsz) == pid_str.size()); } -static void -MakeDaemon(bool is_root) { +static void MakeDaemon(bool is_root) { int fd0; struct rlimit max_files; @@ -474,13 +456,13 @@ MakeDaemon(bool is_root) { // chdir to dir that will always be available if (chdir(kRDCDHomeDir) < 0) { - std::cerr << "Failed to change directory to " < specify address on which to listen; " - "default is 0.0.0.0\n" - "--port, -p specify port on which to listen; " - "default is to listen on port 50051\n" - "--unauth_comm, -u don't do authentication with communications" - " with client. When this flag is not specified, by default, " - "PKI authentication is used\n" - "--pinned_cert, -i used \"pinned\" certificates instead of PKI " - "authentication. This is for test purposes.\n" - "--debug, -d output debug messages\n" - "--help, -h print this message\n"; + std::cout << "Optional rdctst Arguments:\n" + "--address, -a specify address on which to listen; " + "default is 0.0.0.0\n" + "--port, -p specify port on which to listen; " + "default is to listen on port 50051\n" + "--unauth_comm, -u don't do authentication with communications" + " with client. When this flag is not specified, by default, " + "PKI authentication is used\n" + "--pinned_cert, -i used \"pinned\" certificates instead of PKI " + "authentication. 
This is for test purposes.\n" + "--debug, -d output debug messages\n" + "--help, -h print this message\n"; } -uint32_t ProcessCmdline(RdcdCmdLineOpts* cmdl_opts, - int arg_cnt, char** arg_list) { +uint32_t ProcessCmdline(RdcdCmdLineOpts* cmdl_opts, int arg_cnt, char** arg_list) { int a; int ind = -1; @@ -555,8 +533,7 @@ uint32_t ProcessCmdline(RdcdCmdLineOpts* cmdl_opts, switch (a) { case 'a': if (!amd::rdc::IsIP(optarg)) { - std::cerr << "\"" << optarg << - "\" is not a valid IP address." << std::endl; + std::cerr << "\"" << optarg << "\" is not a valid IP address." << std::endl; return -1; } cmdl_opts->listen_address = optarg; @@ -564,8 +541,7 @@ uint32_t ProcessCmdline(RdcdCmdLineOpts* cmdl_opts, case 'p': if (!amd::rdc::IsNumber(optarg)) { - std::cerr << "\"" << optarg << - "\" is not a valid port number." << std::endl; + std::cerr << "\"" << optarg << "\" is not a valid port number." << std::endl; return -1; } cmdl_opts->listen_port = optarg; @@ -588,8 +564,7 @@ uint32_t ProcessCmdline(RdcdCmdLineOpts* cmdl_opts, exit(0); default: - std::cout << "Unknown command line option: \"" << a << - "\". Ignoring..." << std::endl; + std::cout << "Unknown command line option: \"" << a << "\". Ignoring..." << std::endl; PrintHelp(); return 0; } @@ -597,14 +572,13 @@ uint32_t ProcessCmdline(RdcdCmdLineOpts* cmdl_opts, // Check for incompatibilities if (cmdl_opts->use_pinned_certs && cmdl_opts->no_authentication) { - std::cerr << "--pinned_cert and --unauth_comm are incompatible options." - << std::endl; + std::cerr << "--pinned_cert and --unauth_comm are incompatible options." << std::endl; return -1; } return 0; } -static void init_cmd_line_opts(RdcdCmdLineOpts *opts) { +static void init_cmd_line_opts(RdcdCmdLineOpts* opts) { assert(opts != nullptr); opts->listen_address = kDefaultListenAddress; opts->listen_port = kDefaultListenPort; @@ -631,22 +605,20 @@ int main(int argc, char** argv) { // Can read the certificates and private key when authentication. 
if (!cmd_line_opts.no_authentication) { - if (cmd_line_opts.use_pinned_certs && - (access(kDefaultRDCServerCertPinPath, R_OK) != 0 || - access(kDefaultRDCServerKeyPinPath, R_OK) != 0 || - access(kDefaultRDCClientCertPinPath, R_OK) != 0)) { - std::cerr << "The user needs read access to the pinned " - << "certificates and private key." << std::endl; - return 1; + if (cmd_line_opts.use_pinned_certs && (access(kDefaultRDCServerCertPinPath, R_OK) != 0 || + access(kDefaultRDCServerKeyPinPath, R_OK) != 0 || + access(kDefaultRDCClientCertPinPath, R_OK) != 0)) { + std::cerr << "The user needs read access to the pinned " + << "certificates and private key." << std::endl; + return 1; } - if (!cmd_line_opts.use_pinned_certs && - (access(kDefaultRDCServerCertKeyPkiPath, R_OK) != 0 || - access(kDefaultRDCServerCertPemPkiPath, R_OK) != 0 || - access(kDefaultRDCClientCACertPemPkiPath, R_OK) != 0)) { - std::cerr << "The user needs read access to the PKI " - << "certificates and private key." << std::endl; - return 1; + if (!cmd_line_opts.use_pinned_certs && (access(kDefaultRDCServerCertKeyPkiPath, R_OK) != 0 || + access(kDefaultRDCServerCertPemPkiPath, R_OK) != 0 || + access(kDefaultRDCClientCACertPemPkiPath, R_OK) != 0)) { + std::cerr << "The user needs read access to the PKI " + << "certificates and private key." 
<< std::endl; + return 1; } } @@ -656,23 +628,20 @@ int main(int argc, char** argv) { bool cap_enabled; - err = - ::amd::rdc::GetCapability(CAP_DAC_OVERRIDE, CAP_EFFECTIVE, &cap_enabled); + err = ::amd::rdc::GetCapability(CAP_DAC_OVERRIDE, CAP_EFFECTIVE, &cap_enabled); if (err) { std::cerr << "Failed to get capability" << std::endl; return 1; } if (cap_enabled) { - err = - amd::rdc::GetCapability(CAP_DAC_OVERRIDE, CAP_PERMITTED, &cap_enabled); + err = amd::rdc::GetCapability(CAP_DAC_OVERRIDE, CAP_PERMITTED, &cap_enabled); if (err) { std::cerr << "Failed to get capability" << std::endl; return 1; } if (!cap_enabled) { - std::cerr << - "CAP_DAC_OVERRIDE CAP_PERMITTED is not enabled" << std::endl; + std::cerr << "CAP_DAC_OVERRIDE CAP_PERMITTED is not enabled" << std::endl; } } else { std::cerr << "CAP_DAC_OVERRIDE CAP_EFFECTIVE is not enabled." << std::endl; @@ -685,28 +654,23 @@ int main(int argc, char** argv) { // relax this restriction if some new feature requires it. err = amd::rdc::ModifyCapability(CAP_DAC_OVERRIDE, CAP_INHERITABLE, false); if (err) { - std::cerr << "Failed to disable CAP_DAC_OVERRIDE, CAP_INHERITABLE" << - std::endl; + std::cerr << "Failed to disable CAP_DAC_OVERRIDE, CAP_INHERITABLE" << std::endl; return 1; } // By default, disable CAP_DAC_OVERRIDE. Turn on, when needed. err = amd::rdc::ModifyCapability(CAP_DAC_OVERRIDE, CAP_EFFECTIVE, false); if (err) { - std::cerr << "Failed to disable CAP_DAC_OVERRIDE, CAP_EFFECTIVE" << - std::endl; + std::cerr << "Failed to disable CAP_DAC_OVERRIDE, CAP_EFFECTIVE" << std::endl; return 1; } // Create a thread to handle signals to shutdown gracefully pthread_t sig_listen_thread; - int thr_ret = pthread_create(&sig_listen_thread, NULL, - ProcessSignalLoop, &rdc_server); + int thr_ret = pthread_create(&sig_listen_thread, NULL, ProcessSignalLoop, &rdc_server); if (thr_ret) { - std::cerr << - "Failed to create ProcessSignalLoop. 
pthread_create() returned " << - thr_ret; + std::cerr << "Failed to create ProcessSignalLoop. pthread_create() returned " << thr_ret; return 1; } @@ -723,13 +687,11 @@ int main(int argc, char** argv) { // don't fail if it doesn't succeed if (thr_ret != 0) { - std::cerr << - "Failed to terminate ProcessSignalLoop. pthread_join() returned " << - thr_ret; + std::cerr << "Failed to terminate ProcessSignalLoop. pthread_join() returned " << thr_ret; } if (sShutDownServer) { - std::cout << "RDC server successfully shut down." << std::endl; + std::cout << "RDC server successfully shut down." << std::endl; return 0; } else { std::cerr << "RDC server failed to start." << std::endl; diff --git a/projects/rdc/tests/example/rdc_client_test.cc b/projects/rdc/tests/example/rdc_client_test.cc old mode 100755 new mode 100644 index c81dc0d853..4c380ae31f --- a/projects/rdc/tests/example/rdc_client_test.cc +++ b/projects/rdc/tests/example/rdc_client_test.cc @@ -21,26 +21,27 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include +#include "rdc/rdc_client.h" + #include +#include #include -#include "rdc/rdc_client.h" #include "rocm_smi/rocm_smi.h" -#define CHK_RET_STATUS(RET) \ - if ((RET) != RDC_STATUS_SUCCESS) { \ - const char *err_msg_str; \ - (void)rdc_status_string((RET), &err_msg_str); \ - std::cout << "rdc call returned error: " << (RET) << ":\"" << \ - err_msg_str << "\"" << std::endl; \ +#define CHK_RET_STATUS(RET) \ + if ((RET) != RDC_STATUS_SUCCESS) { \ + const char* err_msg_str; \ + (void)rdc_status_string((RET), &err_msg_str); \ + std::cout << "rdc call returned error: " << (RET) << ":\"" << err_msg_str << "\"" \ + << std::endl; \ } -#define CHK_RET_STATUS_CONT(RET) \ - if ((RET) != RDC_STATUS_SUCCESS) { \ - std::cout << "rdc call returned error: " << (RET) << std::endl; \ - continue; \ +#define CHK_RET_STATUS_CONT(RET) \ + if ((RET) != RDC_STATUS_SUCCESS) { \ + std::cout << "rdc call returned error: " << (RET) << std::endl; \ + continue; \ } int main(int argc, char** argv) { @@ -58,11 +59,9 @@ int main(int argc, char** argv) { serv_port = argv[2]; } - std::cout << "Attempting to create channel to " << serv_host << ":" << - serv_port << std::endl; + std::cout << "Attempting to create channel to " << serv_host << ":" << serv_port << std::endl; - ret = rdc_channel_create(&server_ch, serv_host.c_str(), serv_port.c_str(), - false); + ret = rdc_channel_create(&server_ch, serv_host.c_str(), serv_port.c_str(), false); CHK_RET_STATUS(ret) std::cout << "Successfully created channel" << std::endl; @@ -80,23 +79,20 @@ int main(int argc, char** argv) { std::cout << "Getting number of gpus at server..." 
<< std::endl; ret = rdc_num_gpus_get(server_ch, &num_gpu); CHK_RET_STATUS(ret) - std::cout << "Number of GPUs at server is " << server_ch << - num_gpu << std::endl; + std::cout << "Number of GPUs at server is " << server_ch << num_gpu << std::endl; for (uint32_t dv_ind = 0; dv_ind < num_gpu; ++dv_ind) { std::cout << "Info for Device " << dv_ind << ":" << std::endl; std::cout << "\tGetting temperature..." << std::endl; - ret = rdc_dev_temp_metric_get(server_ch, dv_ind, RSMI_TEMP_TYPE_JUNCTION, - RSMI_TEMP_CURRENT, &temperature); + ret = rdc_dev_temp_metric_get(server_ch, dv_ind, RSMI_TEMP_TYPE_JUNCTION, RSMI_TEMP_CURRENT, + &temperature); CHK_RET_STATUS_CONT(ret) - std::cout << "\t GPU " << dv_ind << " has a temperature of " << - temperature << std::endl; + std::cout << "\t GPU " << dv_ind << " has a temperature of " << temperature << std::endl; } ret = rdc_channel_destroy(server_ch); CHK_RET_STATUS(ret) - std::cout << "Successfully destroyed channel to " << serv_host << ":" << - serv_port << std::endl; + std::cout << "Successfully destroyed channel to " << serv_host << ":" << serv_port << std::endl; return 0; } diff --git a/projects/rdc/tests/rdc_tests/functional/rdci_discovery.cc b/projects/rdc/tests/rdc_tests/functional/rdci_discovery.cc index 3bfc63933c..22b3e2e17a 100644 --- a/projects/rdc/tests/rdc_tests/functional/rdci_discovery.cc +++ b/projects/rdc/tests/rdc_tests/functional/rdci_discovery.cc @@ -20,26 +20,25 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include +#include "rdc_tests/functional/rdci_discovery.h" + +#include #include +#include #include -#include - -#include "rdc_tests/functional/rdci_discovery.h" -#include "rdc_tests/test_common.h" #include "rdc/rdc.h" - +#include "rdc_tests/test_common.h" TestRdciDiscovery::TestRdciDiscovery() : TestBase() { set_title("\tRDC Discovery Test"); - set_description("\tThe Discovery tests verifies that the GPUs are " - "discovered and identified ."); + set_description( + "\tThe Discovery tests verifies that the GPUs are " + "discovered and identified ."); } -TestRdciDiscovery::~TestRdciDiscovery(void) { -} +TestRdciDiscovery::~TestRdciDiscovery(void) {} void TestRdciDiscovery::SetUp(void) { TestBase::SetUp(); @@ -48,9 +47,7 @@ void TestRdciDiscovery::SetUp(void) { return; } -void TestRdciDiscovery::DisplayTestInfo(void) { - TestBase::DisplayTestInfo(); -} +void TestRdciDiscovery::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); } void TestRdciDiscovery::DisplayResults(void) const { TestBase::DisplayResults(); @@ -61,15 +58,11 @@ void TestRdciDiscovery::Close() { TestBase::Close(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Disconnecting from host....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Disconnecting from host....\n" << std::endl; } result = rdc_disconnect(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << "\t**Stopping Embedded RDC Engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Stopping Embedded RDC Engine....\n" << std::endl; } result = rdc_stop_embedded(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } @@ -82,54 +75,44 @@ void TestRdciDiscovery::Run(void) { TestBase::Run(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Connecting to host....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Connecting to host....\n" << std::endl; } char hostIpAddress[] = {"localhost:50051"}; result = 
rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; } result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } - uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; uint32_t count = 0; - IF_VERB(STANDARD) { - std::cout << "\t**Getting the devices in the system\n" << std::endl; - } - result = rdc_device_get_all(0, gpu_index_list, &count); + IF_VERB(STANDARD) { std::cout << "\t**Getting the devices in the system\n" << std::endl; } + result = rdc_device_get_all(0, gpu_index_list, &count); ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); - result = rdc_device_get_all(rdc_handle, gpu_index_list, 0); + result = rdc_device_get_all(rdc_handle, gpu_index_list, 0); ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); - result = rdc_device_get_all(rdc_handle, gpu_index_list, &count); + result = rdc_device_get_all(rdc_handle, gpu_index_list, &count); ASSERT_EQ(result, RDC_ST_OK); ASSERT_GT(count, 0); - IF_VERB(STANDARD) { - std::cout << "\t**Fetching attributes of every device\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Fetching attributes of every device\n" << std::endl; } rdc_device_attributes_t attribute; for (uint32_t i = 0; i < count; i++) { - result = rdc_device_get_attributes(0, gpu_index_list[i], &attribute); - ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); + result = rdc_device_get_attributes(0, gpu_index_list[i], &attribute); + ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); - result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], 0); - ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); + result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], 0); + ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); - result = rdc_device_get_attributes(rdc_handle, 
gpu_index_list[i], - &attribute); - ASSERT_EQ(result, RDC_ST_OK); + result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute); + ASSERT_EQ(result, RDC_ST_OK); - std::cout << "\tGPU ID "<< i << " || " << - attribute.device_name < +#include "rdc_tests/functional/rdci_dmon.h" + +#include #include +#include #include -#include - -#include "rdc_tests/functional/rdci_dmon.h" -#include "rdc_tests/test_common.h" #include "rdc/rdc.h" - +#include "rdc_tests/test_common.h" TestRdciDmon::TestRdciDmon() : TestBase() { set_title("\tRDC Dmon Test"); - set_description( - "\tThe Dmon tests verifies that the GPUs metrics are being monitored. "); + set_description("\tThe Dmon tests verifies that the GPUs metrics are being monitored. "); } -TestRdciDmon::~TestRdciDmon(void) { -} +TestRdciDmon::~TestRdciDmon(void) {} void TestRdciDmon::SetUp(void) { TestBase::SetUp(); @@ -48,9 +45,7 @@ void TestRdciDmon::SetUp(void) { return; } -void TestRdciDmon::DisplayTestInfo(void) { - TestBase::DisplayTestInfo(); -} +void TestRdciDmon::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); } void TestRdciDmon::DisplayResults(void) const { TestBase::DisplayResults(); @@ -61,15 +56,11 @@ void TestRdciDmon::Close() { TestBase::Close(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Disconnecting from host....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Disconnecting from host....\n" << std::endl; } result = rdc_disconnect(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << "\t**Stopping Embedded RDC Engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Stopping Embedded RDC Engine....\n" << std::endl; } result = rdc_stop_embedded(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } @@ -82,16 +73,12 @@ void TestRdciDmon::Run(void) { TestBase::Run(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Connecting to host....\n" << std::endl; - } + 
IF_VERB(STANDARD) { std::cout << "\t**Connecting to host....\n" << std::endl; } char hostIpAddress[] = {"localhost:50051"}; result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; } result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } @@ -99,8 +86,7 @@ void TestRdciDmon::Run(void) { rdc_group_info_t group_info; rdc_gpu_group_t group_id; rdc_field_grp_t field_group_id; - result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, - "GRP_DMON", &group_id); + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, "GRP_DMON", &group_id); ASSERT_EQ(result, RDC_ST_OK); result = rdc_group_gpu_add(rdc_handle, group_id, 0); @@ -116,11 +102,9 @@ void TestRdciDmon::Run(void) { ASSERT_EQ(result, RDC_ST_OK); ASSERT_GT(group_info.count, 0); - rdc_field_t field_ids[]= {RDC_FI_GPU_TEMP, RDC_FI_POWER_USAGE, - RDC_FI_GPU_UTIL}; - uint32_t fsize = sizeof(field_ids)/sizeof(field_ids[0]); - result = rdc_group_field_create(rdc_handle, fsize , &field_ids[0], - "FIELD_GRP", &field_group_id); + rdc_field_t field_ids[] = {RDC_FI_GPU_TEMP, RDC_FI_POWER_USAGE, RDC_FI_GPU_UTIL}; + uint32_t fsize = sizeof(field_ids) / sizeof(field_ids[0]); + result = rdc_group_field_create(rdc_handle, fsize, &field_ids[0], "FIELD_GRP", &field_group_id); ASSERT_EQ(result, RDC_ST_OK); result = rdc_field_watch(rdc_handle, -1, field_group_id, 0, 60, 10); diff --git a/projects/rdc/tests/rdc_tests/functional/rdci_dmon.h b/projects/rdc/tests/rdc_tests/functional/rdci_dmon.h index f56b094b09..b17c1eddab 100644 --- a/projects/rdc/tests/rdc_tests/functional/rdci_dmon.h +++ b/projects/rdc/tests/rdc_tests/functional/rdci_dmon.h @@ -26,7 +26,7 @@ THE SOFTWARE. 
class TestRdciDmon : public TestBase { public: - TestRdciDmon(); + TestRdciDmon(); // @Brief: Destructor for test case of TestRdciDmon virtual ~TestRdciDmon(); diff --git a/projects/rdc/tests/rdc_tests/functional/rdci_fieldgroup.cc b/projects/rdc/tests/rdc_tests/functional/rdci_fieldgroup.cc index 9c5b720caf..46b712f803 100644 --- a/projects/rdc/tests/rdc_tests/functional/rdci_fieldgroup.cc +++ b/projects/rdc/tests/rdc_tests/functional/rdci_fieldgroup.cc @@ -20,25 +20,23 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include +#include "rdc_tests/functional/rdci_fieldgroup.h" + +#include #include +#include #include -#include - -#include "rdc_tests/functional/rdci_fieldgroup.h" -#include "rdc_tests/test_common.h" #include "rdc/rdc.h" +#include "rdc_tests/test_common.h" TestRdciFieldgroup::TestRdciFieldgroup() : TestBase() { set_title("\tRDC Fieldgroup Test"); - set_description( - "\tThe Fieldgroup tests verifies the creation/deletion of fieldgroups."); + set_description("\tThe Fieldgroup tests verifies the creation/deletion of fieldgroups."); } -TestRdciFieldgroup::~TestRdciFieldgroup(void) { -} +TestRdciFieldgroup::~TestRdciFieldgroup(void) {} void TestRdciFieldgroup::SetUp(void) { TestBase::SetUp(); @@ -47,9 +45,7 @@ void TestRdciFieldgroup::SetUp(void) { return; } -void TestRdciFieldgroup::DisplayTestInfo(void) { - TestBase::DisplayTestInfo(); -} +void TestRdciFieldgroup::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); } void TestRdciFieldgroup::DisplayResults(void) const { TestBase::DisplayResults(); @@ -60,15 +56,11 @@ void TestRdciFieldgroup::Close() { TestBase::Close(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Disconnecting from host....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Disconnecting from host....\n" << std::endl; } result = rdc_disconnect(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << 
"\t**Stopping Embedded RDC Engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Stopping Embedded RDC Engine....\n" << std::endl; } result = rdc_stop_embedded(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } @@ -81,45 +73,38 @@ void TestRdciFieldgroup::Run(void) { TestBase::Run(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Connecting to host....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Connecting to host....\n" << std::endl; } char hostIpAddress[] = {"localhost:50051"}; result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; } result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } rdc_field_grp_t field_group_id; - rdc_field_t field_ids[]= {RDC_FI_GPU_TEMP, RDC_FI_POWER_USAGE}; - rdc_field_t invalid_field_ids[]= {RDC_FI_INVALID, RDC_FI_INVALID}; - uint32_t fsize = sizeof(field_ids)/sizeof(field_ids[0]); + rdc_field_t field_ids[] = {RDC_FI_GPU_TEMP, RDC_FI_POWER_USAGE}; + rdc_field_t invalid_field_ids[] = {RDC_FI_INVALID, RDC_FI_INVALID}; + uint32_t fsize = sizeof(field_ids) / sizeof(field_ids[0]); uint32_t count = 0; rdc_field_group_info_t group_info; - result = rdc_group_field_create(rdc_handle, fsize, &invalid_field_ids[0], - "FIELD_GRP", &field_group_id); + result = rdc_group_field_create(rdc_handle, fsize, &invalid_field_ids[0], "FIELD_GRP", + &field_group_id); ASSERT_EQ(result, RDC_ST_NOT_SUPPORTED); - result = rdc_group_field_create(NULL, fsize, &field_ids[0], - "FIELD_GRP", &field_group_id); + result = rdc_group_field_create(NULL, fsize, &field_ids[0], "FIELD_GRP", &field_group_id); ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); - result = rdc_group_field_create(rdc_handle, fsize, &field_ids[0], - NULL, 
NULL); + result = rdc_group_field_create(rdc_handle, fsize, &field_ids[0], NULL, NULL); ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); - result = rdc_group_field_create(rdc_handle, - (RDC_MAX_FIELD_IDS_PER_FIELD_GROUP+1), &field_ids[0], "FIELD_GRP", NULL); + result = rdc_group_field_create(rdc_handle, (RDC_MAX_FIELD_IDS_PER_FIELD_GROUP + 1), + &field_ids[0], "FIELD_GRP", NULL); ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); - result = rdc_group_field_create(rdc_handle, fsize, &field_ids[0], - "FIELD_GRP", &field_group_id); + result = rdc_group_field_create(rdc_handle, fsize, &field_ids[0], "FIELD_GRP", &field_group_id); ASSERT_EQ(result, RDC_ST_OK); rdc_field_grp_t group_id_list[RDC_MAX_NUM_FIELD_GROUPS]; @@ -131,14 +116,15 @@ void TestRdciFieldgroup::Run(void) { ASSERT_GT(count, 0); - std::cout << "\tGroupID\t" << "GroupName\t" << "FieldIds\n"; + std::cout << "\tGroupID\t" + << "GroupName\t" + << "FieldIds\n"; for (uint32_t i = 0; i < count; i++) { - result = rdc_group_field_get_info(rdc_handle, group_id_list[i], 0); - ASSERT_EQ(result, RDC_ST_BAD_PARAMETER); + result = rdc_group_field_get_info(rdc_handle, group_id_list[i], 0); + ASSERT_EQ(result, RDC_ST_BAD_PARAMETER); - result = rdc_group_field_get_info(rdc_handle, group_id_list[i], - &group_info); - ASSERT_EQ(result, RDC_ST_OK); + result = rdc_group_field_get_info(rdc_handle, group_id_list[i], &group_info); + ASSERT_EQ(result, RDC_ST_OK); } result = rdc_group_field_destroy(NULL, field_group_id); diff --git a/projects/rdc/tests/rdc_tests/functional/rdci_fieldgroup.h b/projects/rdc/tests/rdc_tests/functional/rdci_fieldgroup.h index ceb0fc3b6a..33cb12d7d9 100644 --- a/projects/rdc/tests/rdc_tests/functional/rdci_fieldgroup.h +++ b/projects/rdc/tests/rdc_tests/functional/rdci_fieldgroup.h @@ -26,7 +26,7 @@ THE SOFTWARE. 
class TestRdciFieldgroup : public TestBase { public: - TestRdciFieldgroup(); + TestRdciFieldgroup(); // @Brief: Destructor for test case of TestRdciFieldgroup virtual ~TestRdciFieldgroup(); diff --git a/projects/rdc/tests/rdc_tests/functional/rdci_group.cc b/projects/rdc/tests/rdc_tests/functional/rdci_group.cc index 432f51aba8..6a858aca38 100644 --- a/projects/rdc/tests/rdc_tests/functional/rdci_group.cc +++ b/projects/rdc/tests/rdc_tests/functional/rdci_group.cc @@ -20,24 +20,23 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include +#include "rdc_tests/functional/rdci_group.h" + +#include #include +#include #include -#include - -#include "rdc_tests/functional/rdci_group.h" -#include "rdc_tests/test_common.h" #include "rdc/rdc.h" +#include "rdc_tests/test_common.h" TestRdciGroup::TestRdciGroup() : TestBase() { set_title("\tRDC Group Test"); set_description("\tThe Group tests verifies creation/deletion of GPU groups"); } -TestRdciGroup::~TestRdciGroup(void) { -} +TestRdciGroup::~TestRdciGroup(void) {} void TestRdciGroup::SetUp(void) { TestBase::SetUp(); @@ -46,9 +45,7 @@ void TestRdciGroup::SetUp(void) { return; } -void TestRdciGroup::DisplayTestInfo(void) { - TestBase::DisplayTestInfo(); -} +void TestRdciGroup::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); } void TestRdciGroup::DisplayResults(void) const { TestBase::DisplayResults(); @@ -59,15 +56,11 @@ void TestRdciGroup::Close() { TestBase::Close(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Disconnecting from host....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Disconnecting from host....\n" << std::endl; } result = rdc_disconnect(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << "\t**Stopping Embedded RDC Engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Stopping Embedded RDC Engine....\n" << std::endl; } result = 
rdc_stop_embedded(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } @@ -80,16 +73,12 @@ void TestRdciGroup::Run(void) { TestBase::Run(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Connecting to host....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Connecting to host....\n" << std::endl; } char hostIpAddress[] = {"localhost:50051"}; result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; } result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } @@ -107,8 +96,7 @@ void TestRdciGroup::Run(void) { result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, "GRP_NAME", NULL); ASSERT_EQ(result, RDC_ST_BAD_PARAMETER); - result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, - "GRP_NAME", &group_id); + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, "GRP_NAME", &group_id); ASSERT_EQ(result, RDC_ST_OK); result = rdc_group_gpu_add(rdc_handle, group_id, -1); @@ -116,9 +104,9 @@ void TestRdciGroup::Run(void) { result = rdc_device_get_all(rdc_handle, gpu_index_list, &count); ASSERT_EQ(result, RDC_ST_OK); - for (uint32_t i=0; i < count; i++) { - result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]); - ASSERT_EQ(result, RDC_ST_OK); + for (uint32_t i = 0; i < count; i++) { + result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]); + ASSERT_EQ(result, RDC_ST_OK); } rdc_gpu_group_t group_id_list[RDC_MAX_NUM_GROUPS]; @@ -128,23 +116,22 @@ void TestRdciGroup::Run(void) { result = rdc_group_get_all_ids(rdc_handle, group_id_list, &count); ASSERT_EQ(result, RDC_ST_OK); for (uint32_t i = 0; i < count; i++) { - result = rdc_group_gpu_get_info(rdc_handle, - group_id_list[i], NULL); - ASSERT_EQ(result, 
RDC_ST_INVALID_HANDLER); - result = rdc_group_gpu_get_info(rdc_handle, - group_id_list[i], &group_info); - ASSERT_EQ(result, RDC_ST_OK); - std::cout << "\tGroupID\t" << "GroupName\t" << "GPUIndex\n"; - std::cout << "\t" << group_id_list[i] << "\t" - << group_info.group_name << "\t\t"; - for (uint32_t j = 0; j < group_info.count; j++) { - std::cout << group_info.entity_ids[j]; - if (j < group_info.count -1) { - std::cout << ","; - } - } - std::cout << std::endl; + result = rdc_group_gpu_get_info(rdc_handle, group_id_list[i], NULL); + ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); + result = rdc_group_gpu_get_info(rdc_handle, group_id_list[i], &group_info); + ASSERT_EQ(result, RDC_ST_OK); + std::cout << "\tGroupID\t" + << "GroupName\t" + << "GPUIndex\n"; + std::cout << "\t" << group_id_list[i] << "\t" << group_info.group_name << "\t\t"; + for (uint32_t j = 0; j < group_info.count; j++) { + std::cout << group_info.entity_ids[j]; + if (j < group_info.count - 1) { + std::cout << ","; + } } + std::cout << std::endl; + } result = rdc_group_gpu_destroy(0, group_id); ASSERT_EQ(result, RDC_ST_INVALID_HANDLER); diff --git a/projects/rdc/tests/rdc_tests/functional/rdci_group.h b/projects/rdc/tests/rdc_tests/functional/rdci_group.h index 00de9afe64..d6b2804445 100644 --- a/projects/rdc/tests/rdc_tests/functional/rdci_group.h +++ b/projects/rdc/tests/rdc_tests/functional/rdci_group.h @@ -26,7 +26,7 @@ THE SOFTWARE. class TestRdciGroup : public TestBase { public: - TestRdciGroup(); + TestRdciGroup(); // @Brief: Destructor for test case of TestRdciGroup virtual ~TestRdciGroup(); diff --git a/projects/rdc/tests/rdc_tests/functional/rdci_stats.cc b/projects/rdc/tests/rdc_tests/functional/rdci_stats.cc index b3ffe96b1a..ad366fcb9c 100644 --- a/projects/rdc/tests/rdc_tests/functional/rdci_stats.cc +++ b/projects/rdc/tests/rdc_tests/functional/rdci_stats.cc @@ -20,26 +20,25 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include +#include "rdc_tests/functional/rdci_stats.h" + +#include #include +#include #include -#include - -#include "rdc_tests/functional/rdci_stats.h" -#include "rdc_tests/test_common.h" #include "rdc/rdc.h" - +#include "rdc_tests/test_common.h" TestRdciStats::TestRdciStats() : TestBase() { set_title("\tRDC Stats Test"); - set_description("\tThe Stats tests collects and verifies job " - "statistics running on gpu groups."); + set_description( + "\tThe Stats tests collects and verifies job " + "statistics running on gpu groups."); } -TestRdciStats::~TestRdciStats(void) { -} +TestRdciStats::~TestRdciStats(void) {} void TestRdciStats::SetUp(void) { TestBase::SetUp(); @@ -48,9 +47,7 @@ void TestRdciStats::SetUp(void) { return; } -void TestRdciStats::DisplayTestInfo(void) { - TestBase::DisplayTestInfo(); -} +void TestRdciStats::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); } void TestRdciStats::DisplayResults(void) const { TestBase::DisplayResults(); @@ -61,15 +58,11 @@ void TestRdciStats::Close() { TestBase::Close(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Disconnecting from host....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Disconnecting from host....\n" << std::endl; } result = rdc_disconnect(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << "\t**Stopping Embedded RDC Engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Stopping Embedded RDC Engine....\n" << std::endl; } result = rdc_stop_embedded(rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } @@ -82,24 +75,18 @@ void TestRdciStats::Run(void) { TestBase::Run(); rdc_status_t result; if (standalone_) { - IF_VERB(STANDARD) { - std::cout << "\t**Connecting to host....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Connecting to host....\n" << std::endl; } char hostIpAddress[] = {"localhost:50051"}; - result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, - nullptr, nullptr); + 
result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); ASSERT_EQ(result, RDC_ST_OK); } else { - IF_VERB(STANDARD) { - std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Starting embedded RDC engine....\n" << std::endl; } result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); ASSERT_EQ(result, RDC_ST_OK); } rdc_gpu_group_t group_id; - result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, - "GRP_NAME", &group_id); + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, "GRP_NAME", &group_id); ASSERT_EQ(result, RDC_ST_OK); result = rdc_group_gpu_add(rdc_handle, group_id, 0); @@ -107,10 +94,10 @@ void TestRdciStats::Run(void) { result = rdc_job_start_stats(rdc_handle, group_id, "0", 1000000); if (result == RDC_ST_NOT_SUPPORTED) { - std::cout << - "\t** GPU Metric is not supported" - " on this machine" << std::endl; - return; + std::cout << "\t** GPU Metric is not supported" + " on this machine" + << std::endl; + return; } ASSERT_EQ(result, RDC_ST_OK); @@ -121,7 +108,8 @@ void TestRdciStats::Run(void) { result = rdc_job_get_stats(rdc_handle, "0", &job_info); if (result == RDC_ST_NOT_SUPPORTED) { std::cout << "\t** GPU Metric is not supported" - " on this machine" << std::endl; + " on this machine" + << std::endl; return; } ASSERT_EQ(result, RDC_ST_OK); diff --git a/projects/rdc/tests/rdc_tests/functional/rdci_stats.h b/projects/rdc/tests/rdc_tests/functional/rdci_stats.h index ae60371bc6..c0de2c0993 100644 --- a/projects/rdc/tests/rdc_tests/functional/rdci_stats.h +++ b/projects/rdc/tests/rdc_tests/functional/rdci_stats.h @@ -26,7 +26,7 @@ THE SOFTWARE. 
class TestRdciStats : public TestBase { public: - TestRdciStats(); + TestRdciStats(); // @Brief: Destructor for test case of TestRdciStats virtual ~TestRdciStats(); diff --git a/projects/rdc/tests/rdc_tests/main.cc b/projects/rdc/tests/rdc_tests/main.cc old mode 100755 new mode 100644 index 20399df7e4..ce415cb70d --- a/projects/rdc/tests/rdc_tests/main.cc +++ b/projects/rdc/tests/rdc_tests/main.cc @@ -21,29 +21,27 @@ THE SOFTWARE. */ #include - -#include -#include -#include -#include -#include - #include -#include "rdc/rdc.h" -#include "rocm_smi/rocm_smi.h" -#include "rdc_tests/test_common.h" -#include "rdc_tests/test_base.h" +#include +#include +#include +#include +#include #include "functional/rdci_discovery.h" -#include "functional/rdci_group.h" #include "functional/rdci_dmon.h" #include "functional/rdci_fieldgroup.h" +#include "functional/rdci_group.h" #include "functional/rdci_stats.h" +#include "rdc/rdc.h" +#include "rdc_tests/test_base.h" +#include "rdc_tests/test_common.h" +#include "rocm_smi/rocm_smi.h" -static RDCTstGlobals *sRDCGlvalues = nullptr; +static RDCTstGlobals* sRDCGlvalues = nullptr; -static void SetFlags(TestBase *test) { +static void SetFlags(TestBase* test) { assert(sRDCGlvalues != nullptr); test->set_verbosity(sRDCGlvalues->verbosity); @@ -54,7 +52,7 @@ static void SetFlags(TestBase *test) { test->set_mode(sRDCGlvalues->standalone); } -static void RunCustomTestProlog(TestBase *test) { +static void RunCustomTestProlog(TestBase* test) { SetFlags(test); test->DisplayTestInfo(); @@ -62,7 +60,7 @@ static void RunCustomTestProlog(TestBase *test) { test->Run(); return; } -static void RunCustomTestEpilog(TestBase *test) { +static void RunCustomTestEpilog(TestBase* test) { test->DisplayResults(); test->Close(); return; @@ -74,7 +72,7 @@ static void RunCustomTestEpilog(TestBase *test) { // * RunCustomTestProlog(test) // Run() should contain minimal code // * // * RunCustomTestEpilog(test) -static void RunGenericTest(TestBase *test) { +static void 
RunGenericTest(TestBase* test) { RunCustomTestProlog(test); RunCustomTestEpilog(test); return; @@ -108,14 +106,13 @@ TEST(rdctstReadOnly, TestRdciStats) { static int getPIDFromName(std::string name) { int pid = -1; - DIR *dir_ptr = opendir("/proc"); + DIR* dir_ptr = opendir("/proc"); if (dir_ptr != NULL) { - struct dirent *dentry; + struct dirent* dentry; while (pid < 0 && (dentry = readdir(dir_ptr))) { int id = atoi(dentry->d_name); if (id > 0) { - std::string commPath = std::string("/proc/") + - dentry->d_name + "/comm"; + std::string commPath = std::string("/proc/") + dentry->d_name + "/comm"; std::ifstream cmdFile(commPath.c_str()); std::string cmdLine; getline(cmdFile, cmdLine); @@ -172,13 +169,13 @@ static int killRDCD(int pid = 0) { return 0; } -static int startRDCD(std::string *rdcd_path, char *envp[]) { +static int startRDCD(std::string* rdcd_path, char* envp[]) { assert(rdcd_path != nullptr); - const char *rdcd_cl[128] = {rdcd_path->c_str(), "-u", NULL}; + const char* rdcd_cl[128] = {rdcd_path->c_str(), "-u", NULL}; int pid = fork(); if (pid == 0) { - if (-1 == execve(rdcd_cl[0], (char **)rdcd_cl , envp)) { // NOLINT + if (-1 == execve(rdcd_cl[0], (char**)rdcd_cl, envp)) { // NOLINT std::string err_msg = "ERROR: Child process failed to start "; err_msg += *rdcd_path; perror(err_msg.c_str()); @@ -220,13 +217,13 @@ int main(int argc, char** argv, char* envp[]) { std::cout << "0 - Embedded mode \n"; std::cout << "1 - Standalone mode \n"; while (!(std::cin >> settings.standalone)) { - std::cout << "Invalid input.\n"; - std::cin.clear(); - std::cin.ignore(); + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); } std::cout << std::endl; - std::cout << (settings.standalone? - "Standalone mode selected.\n":"Embedded mode selected.\n"); + std::cout << (settings.standalone ? 
"Standalone mode selected.\n" + : "Embedded mode selected.\n"); } sRDCGlvalues = &settings; @@ -271,9 +268,10 @@ int main(int argc, char** argv, char* envp[]) { } } else { if (getPIDFromName("rdcd") == -1) { - std::cout << - "rdcd is not running. Use -d (--start_rdcd) to have rdcd started." - " Exiting test." << std::endl; + std::cout << "rdcd is not running. Use -d (--start_rdcd) to have " + "rdcd started." + " Exiting test." + << std::endl; return 1; } } diff --git a/projects/rdc/tests/rdc_tests/test_base.cc b/projects/rdc/tests/rdc_tests/test_base.cc old mode 100755 new mode 100644 index 0d14386f62..de52e13e50 --- a/projects/rdc/tests/rdc_tests/test_base.cc +++ b/projects/rdc/tests/rdc_tests/test_base.cc @@ -19,13 +19,13 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include +#include "rdc_tests/test_base.h" +#include #include -#include "rocm_smi/rocm_smi.h" -#include "rdc_tests/test_base.h" #include "rdc_tests/test_common.h" +#include "rocm_smi/rocm_smi.h" static const int kOutputLineLength = 80; static const char kLabelDelimiter[] = "####"; @@ -39,12 +39,10 @@ rdc_status_t result; /*TestBase::TestBase() : description_(""), rdc_channel_(0) { }*/ -TestBase::TestBase() : description_("") { -} -TestBase::~TestBase() { -} +TestBase::TestBase() : description_("") {} +TestBase::~TestBase() {} -static void MakeHeaderStr(const char *inStr, std::string *outStr) { +static void MakeHeaderStr(const char* inStr, std::string* outStr) { assert(outStr != nullptr); assert(inStr != nullptr); @@ -65,25 +63,18 @@ void TestBase::SetUp(void) { } void TestBase::PrintDeviceHeader(uint32_t dv_ind) { - IF_VERB(STANDARD) { - std::cout << "\t**Device index: " << dv_ind << std::endl; - } + IF_VERB(STANDARD) { std::cout << "\t**Device index: " << dv_ind << std::endl; } std::cout << std::setbase(10); } -rdc_status_t -TestBase::AllocateRDCChannel(void) { - - 
IF_VERB(STANDARD) { - std::cout << "\t**Initializing RDC" << std::endl; - } +rdc_status_t TestBase::AllocateRDCChannel(void) { + IF_VERB(STANDARD) { std::cout << "\t**Initializing RDC" << std::endl; } rdc_status_t result = rdc_init(0); if (result != RDC_ST_OK) { - std::cout << "Error initializing RDC.... " << - rdc_status_string(result) << std::endl; - return result; - } + std::cout << "Error initializing RDC.... " << rdc_status_string(result) << std::endl; + return result; + } return result; } @@ -109,8 +100,9 @@ void TestBase::DisplayResults(void) const { } void TestBase::DisplayTestInfo(void) { - printf("#########################################" - "######################################\n"); + printf( + "#########################################" + "######################################\n"); std::string label; MakeHeaderStr(kTitleLabel, &label); @@ -134,4 +126,3 @@ void TestBase::set_description(std::string d) { i = endlptr; } } - diff --git a/projects/rdc/tests/rdc_tests/test_base.h b/projects/rdc/tests/rdc_tests/test_base.h old mode 100755 new mode 100644 index 6b6745da31..6db5beb192 --- a/projects/rdc/tests/rdc_tests/test_base.h +++ b/projects/rdc/tests/rdc_tests/test_base.h @@ -23,6 +23,7 @@ THE SOFTWARE. #define TESTS_RDC_TESTS_TEST_BASE_H_ #include + #include "rdc/rdc.h" class TestBase { @@ -31,7 +32,7 @@ class TestBase { virtual ~TestBase(void); - enum VerboseLevel {VERBOSE_MIN = 0, VERBOSE_STANDARD, VERBOSE_PROGRESS}; + enum VerboseLevel { VERBOSE_MIN = 0, VERBOSE_STANDARD, VERBOSE_PROGRESS }; // @Brief: Before run the core measure codes, do something to set up // i.e. init runtime, prepare packet... 
@@ -49,88 +50,58 @@ class TestBase { // @Brief: Display information about the test virtual void DisplayTestInfo(void); - const std::string & description(void) const {return description_;} + const std::string& description(void) const { return description_; } void set_description(std::string d); - void set_title(std::string name) { - title_ = name; - } - std::string title(void) const { - return title_; - } - void set_verbosity(uint32_t v) { - verbosity_ = v; - } - uint32_t verbosity(void) const { - return verbosity_; - } - void set_dont_fail(bool f) { - dont_fail_ = f; - } - bool dont_fail(void) const { - return dont_fail_; - } - void set_num_monitor_devs(uint32_t i) { - num_monitor_devs_ = i; - } - uint32_t num_monitor_devs(void) const { - return num_monitor_devs_; - } - void set_monitor_server_ip(std::string ip) { - monitor_server_ip_ = ip; - } - std::string monitor_server_ip(void) const { - return monitor_server_ip_; - } - void set_monitor_server_port(std::string port) { - monitor_server_port_ = port; - } - std::string monitor_server_port(void) const { - return monitor_server_port_; - } - void set_secure(bool sec) { - secure_ = sec; - } - bool secure(void) const { - return secure_; - } - void set_mode(bool standalone) { - standalone_ = standalone; - } + void set_title(std::string name) { title_ = name; } + std::string title(void) const { return title_; } + void set_verbosity(uint32_t v) { verbosity_ = v; } + uint32_t verbosity(void) const { return verbosity_; } + void set_dont_fail(bool f) { dont_fail_ = f; } + bool dont_fail(void) const { return dont_fail_; } + void set_num_monitor_devs(uint32_t i) { num_monitor_devs_ = i; } + uint32_t num_monitor_devs(void) const { return num_monitor_devs_; } + void set_monitor_server_ip(std::string ip) { monitor_server_ip_ = ip; } + std::string monitor_server_ip(void) const { return monitor_server_ip_; } + void set_monitor_server_port(std::string port) { monitor_server_port_ = port; } + std::string monitor_server_port(void) 
const { return monitor_server_port_; } + void set_secure(bool sec) { secure_ = sec; } + bool secure(void) const { return secure_; } + void set_mode(bool standalone) { standalone_ = standalone; } rdc_handle_t rdc_handle; protected: void PrintDeviceHeader(uint32_t dv_ind); rdc_status_t AllocateRDCChannel(void); - bool standalone_; + bool standalone_; private: uint64_t num_monitor_devs_; ///< Number of monitor devices found std::string description_; std::string title_; ///< Displayed title of test - uint32_t verbosity_; ///< How much additional output to produce - bool dont_fail_; ///< Don't quit test on individual failure if true + uint32_t verbosity_; ///< How much additional output to produce + bool dont_fail_; ///< Don't quit test on individual failure if true std::string monitor_server_ip_; std::string monitor_server_port_; bool secure_; // Use authenticated comms. (SSL/TSL) - }; #define IF_VERB(VB) if (verbosity() && verbosity() >= (TestBase::VERBOSE_##VB)) // Macros to be used within TestBase classes -#define CHK_ERR_ASRT(RET) { \ - if (dont_fail() && ((RET) != RDC_STATUS_SUCCESS)) { \ - std::cout << std::endl << "\t===> TEST FAILURE." << std::endl; \ - DISPLAY_RDC_ERR(RET); \ - std::cout << \ - "\t===> Abort is over-ridden due to dont_fail command line option." \ - << std::endl; \ - return; \ - } else { \ - ASSERT_EQ(RDC_STATUS_SUCCESS, (RET)); \ - } \ -} +#define CHK_ERR_ASRT(RET) \ + { \ + if (dont_fail() && ((RET) != RDC_STATUS_SUCCESS)) { \ + std::cout << std::endl << "\t===> TEST FAILURE." << std::endl; \ + DISPLAY_RDC_ERR(RET); \ + std::cout << "\t===> Abort is over-ridden due to dont_fail command " \ + "line option." 
\ + << std::endl; \ + return; \ + } else { \ + ASSERT_EQ(RDC_STATUS_SUCCESS, (RET)); \ + } \ + } #endif // TESTS_RDC_TESTS_TEST_BASE_H_ diff --git a/projects/rdc/tests/rdc_tests/test_common.cc b/projects/rdc/tests/rdc_tests/test_common.cc old mode 100755 new mode 100644 index 2e39ba41e7..58b5fd2dda --- a/projects/rdc/tests/rdc_tests/test_common.cc +++ b/projects/rdc/tests/rdc_tests/test_common.cc @@ -20,19 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "rdc_tests/test_common.h" + #include -#include #include +#include #include -#include #include +#include #include "rdc_tests/test_base.h" -#include "rdc_tests/test_common.h" #include "rocm_smi/rocm_smi.h" -/*static const std::map kGRPCChanState = { +/*static const std::map kGRPCChanState = +{ { {GRPC_CHANNEL_IDLE, "GRPC_CHANNEL_IDLE: Channel is idle"}, {GRPC_CHANNEL_CONNECTING, @@ -45,110 +47,89 @@ THE SOFTWARE. }, }; */ -static const std::map kBlockNameMap = { - {RSMI_GPU_BLOCK_UMC, "UMC"}, - {RSMI_GPU_BLOCK_SDMA, "SDMA"}, - {RSMI_GPU_BLOCK_GFX, "GFX"}, - {RSMI_GPU_BLOCK_MMHUB, "MMHUB"}, - {RSMI_GPU_BLOCK_ATHUB, "ATHUB"}, - {RSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"}, - {RSMI_GPU_BLOCK_HDP, "HDP"}, - {RSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"}, - {RSMI_GPU_BLOCK_DF, "DF"}, - {RSMI_GPU_BLOCK_SMN, "SMN"}, - {RSMI_GPU_BLOCK_SEM, "SEM"}, - {RSMI_GPU_BLOCK_MP0, "MP0"}, - {RSMI_GPU_BLOCK_MP1, "MP1"}, - {RSMI_GPU_BLOCK_FUSE, "FUSE"}, +static const std::map kBlockNameMap = { + {RSMI_GPU_BLOCK_UMC, "UMC"}, {RSMI_GPU_BLOCK_SDMA, "SDMA"}, + {RSMI_GPU_BLOCK_GFX, "GFX"}, {RSMI_GPU_BLOCK_MMHUB, "MMHUB"}, + {RSMI_GPU_BLOCK_ATHUB, "ATHUB"}, {RSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"}, + {RSMI_GPU_BLOCK_HDP, "HDP"}, {RSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"}, + {RSMI_GPU_BLOCK_DF, "DF"}, {RSMI_GPU_BLOCK_SMN, "SMN"}, + {RSMI_GPU_BLOCK_SEM, "SEM"}, {RSMI_GPU_BLOCK_MP0, "MP0"}, + {RSMI_GPU_BLOCK_MP1, "MP1"}, {RSMI_GPU_BLOCK_FUSE, "FUSE"}, }; -static_assert(RSMI_GPU_BLOCK_LAST == 
RSMI_GPU_BLOCK_FUSE, - "kBlockNameMap needs to be updated"); +static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_FUSE, "kBlockNameMap needs to be updated"); -static const char * kRasErrStateStrings[] = { - "None", // RSMI_RAS_ERR_STATE_NONE - "Disabled", // RSMI_RAS_ERR_STATE_DISABLED - "Error Unknown", // RSMI_RAS_ERR_STATE_PARITY - "Single, Correctable", // RSMI_RAS_ERR_STATE_SING_C - "Multiple, Uncorrectable", // RSMI_RAS_ERR_STATE_MULT_UC - "Poison" // RSMI_RAS_ERR_STATE_POISON - "Off", // RSMI_RAS_ERR_STATE_DISABLED - "On", // RSMI_RAS_ERR_STATE_ENABLED +static const char* kRasErrStateStrings[] = { + "None", // RSMI_RAS_ERR_STATE_NONE + "Disabled", // RSMI_RAS_ERR_STATE_DISABLED + "Error Unknown", // RSMI_RAS_ERR_STATE_PARITY + "Single, Correctable", // RSMI_RAS_ERR_STATE_SING_C + "Multiple, Uncorrectable", // RSMI_RAS_ERR_STATE_MULT_UC + "Poison" // RSMI_RAS_ERR_STATE_POISON + "Off", // RSMI_RAS_ERR_STATE_DISABLED + "On", // RSMI_RAS_ERR_STATE_ENABLED }; -static_assert( - sizeof(kRasErrStateStrings)/sizeof(char *) == (RSMI_RAS_ERR_STATE_LAST + 1), - "kErrStateNameMap needs to be updated"); +static_assert(sizeof(kRasErrStateStrings) / sizeof(char*) == (RSMI_RAS_ERR_STATE_LAST + 1), + "kErrStateNameMap needs to be updated"); - -static const std::map kErrStateNameMap = { - {RSMI_RAS_ERR_STATE_NONE, - kRasErrStateStrings[RSMI_RAS_ERR_STATE_NONE]}, - {RSMI_RAS_ERR_STATE_DISABLED, - kRasErrStateStrings[RSMI_RAS_ERR_STATE_DISABLED]}, - {RSMI_RAS_ERR_STATE_PARITY, - kRasErrStateStrings[RSMI_RAS_ERR_STATE_PARITY]}, - {RSMI_RAS_ERR_STATE_SING_C, - kRasErrStateStrings[RSMI_RAS_ERR_STATE_SING_C]}, - {RSMI_RAS_ERR_STATE_MULT_UC, - kRasErrStateStrings[RSMI_RAS_ERR_STATE_MULT_UC]}, - {RSMI_RAS_ERR_STATE_POISON, - kRasErrStateStrings[RSMI_RAS_ERR_STATE_POISON]}, - {RSMI_RAS_ERR_STATE_ENABLED, - kRasErrStateStrings[RSMI_RAS_ERR_STATE_ENABLED]}, +static const std::map kErrStateNameMap = { + {RSMI_RAS_ERR_STATE_NONE, kRasErrStateStrings[RSMI_RAS_ERR_STATE_NONE]}, + 
{RSMI_RAS_ERR_STATE_DISABLED, kRasErrStateStrings[RSMI_RAS_ERR_STATE_DISABLED]}, + {RSMI_RAS_ERR_STATE_PARITY, kRasErrStateStrings[RSMI_RAS_ERR_STATE_PARITY]}, + {RSMI_RAS_ERR_STATE_SING_C, kRasErrStateStrings[RSMI_RAS_ERR_STATE_SING_C]}, + {RSMI_RAS_ERR_STATE_MULT_UC, kRasErrStateStrings[RSMI_RAS_ERR_STATE_MULT_UC]}, + {RSMI_RAS_ERR_STATE_POISON, kRasErrStateStrings[RSMI_RAS_ERR_STATE_POISON]}, + {RSMI_RAS_ERR_STATE_ENABLED, kRasErrStateStrings[RSMI_RAS_ERR_STATE_ENABLED]}, }; static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_ENABLED, - "kErrStateNameMap needs to be updated"); + "kErrStateNameMap needs to be updated"); static const struct option long_options[] = { - {"iterations", required_argument, nullptr, 'i'}, - {"verbose", required_argument, nullptr, 'v'}, - {"monitor_verbose", required_argument, nullptr, 'm'}, - {"remote_server_ip", required_argument, nullptr, 's'}, - {"remote_server_port", required_argument, nullptr, 'p'}, - {"start_rdcd", optional_argument, nullptr, 'd'}, - {"batch_mode", no_argument, nullptr, 'b'}, - {"dont_fail", no_argument, nullptr, 'f'}, - {"unauth_comm", no_argument, nullptr, 'u'}, - {"rdctst_help", no_argument, nullptr, 'r'}, + {"iterations", required_argument, nullptr, 'i'}, + {"verbose", required_argument, nullptr, 'v'}, + {"monitor_verbose", required_argument, nullptr, 'm'}, + {"remote_server_ip", required_argument, nullptr, 's'}, + {"remote_server_port", required_argument, nullptr, 'p'}, + {"start_rdcd", optional_argument, nullptr, 'd'}, + {"batch_mode", no_argument, nullptr, 'b'}, + {"dont_fail", no_argument, nullptr, 'f'}, + {"unauth_comm", no_argument, nullptr, 'u'}, + {"rdctst_help", no_argument, nullptr, 'r'}, - {nullptr, 0, nullptr, 0} -}; + {nullptr, 0, nullptr, 0}}; static const char* short_options = "i:v:m:s:p:d:bfur"; static void PrintHelp(void) { - std::cout << - "Optional rdctst Arguments:\n" - "--batch_mode, -b run in embedded mode with no interactive prompts\n" - "--dont_fail, -f if set, don't fail test 
when individual test fails; " - "default is to fail when an individual test fails\n" - "--rdctst_help, -r print this help message\n" - "--verbosity, -v \n" - " Verbosity levels:\n" - " 0 -- minimal; just summary information\n" - " 1 -- intermediate; show intermediate values such as intermediate " - "perf. data\n" - " 2 -- progress; show progress displays\n" - " >= 3 -- more debug output\n" - "--start_rdcd ; start default version of rdcd, or " - "optionally specified rdcd\n" - "--remote_server_ip ; connect to already running " - "rdcd on specified IP\n" - "--remote_server_port ; connect to already running " - "rdcd on specified IP at this port\n" - "--unauth_comm; don't use TSL/SSL authentication; " - "default is with authentication\n"; + std::cout << "Optional rdctst Arguments:\n" + "--batch_mode, -b run in embedded mode with no interactive prompts\n" + "--dont_fail, -f if set, don't fail test when individual test fails; " + "default is to fail when an individual test fails\n" + "--rdctst_help, -r print this help message\n" + "--verbosity, -v \n" + " Verbosity levels:\n" + " 0 -- minimal; just summary information\n" + " 1 -- intermediate; show intermediate values such as " + "intermediate " + "perf. 
data\n" + " 2 -- progress; show progress displays\n" + " >= 3 -- more debug output\n" + "--start_rdcd ; start default version of rdcd, or " + "optionally specified rdcd\n" + "--remote_server_ip ; connect to already running " + "rdcd on specified IP\n" + "--remote_server_port ; connect to already running " + "rdcd on specified IP at this port\n" + "--unauth_comm; don't use TSL/SSL authentication; " + "default is with authentication\n"; } -static bool CheckArgs(RDCTstGlobals *test) { +static bool CheckArgs(RDCTstGlobals* test) { if (test->batch_mode) { - if ( - (test->monitor_server_ip != "") || - (test->monitor_server_port != "") || - test->secure) { + if ((test->monitor_server_ip != "") || (test->monitor_server_port != "") || test->secure) { std::cout << "--batch_mode option is incompatible with " - "--remote_server_ip, --remote_server_port and --unauth_comm" << - std::endl; + "--remote_server_ip, --remote_server_port and --unauth_comm" + << std::endl; return false; - } + } } return true; } @@ -193,7 +174,7 @@ uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list) { case 'p': test->monitor_server_port = optarg; - break; + break; case 'r': PrintHelp(); @@ -213,8 +194,7 @@ uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list) { break; default: - std::cout << "Unknown command line option: \"" << - arg_list[arg_ind] << "\"" << std::endl; + std::cout << "Unknown command line option: \"" << arg_list[arg_ind] << "\"" << std::endl; PrintHelp(); return 1; } @@ -227,31 +207,32 @@ uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list) { return 1; } -const char *GetBlockNameStr(rsmi_gpu_block_t id) { - return kBlockNameMap.at(id); -} -const char *GetErrStateNameStr(rsmi_ras_err_state_t st) { - return kErrStateNameMap.at(st); -} +const char* GetBlockNameStr(rsmi_gpu_block_t id) { return kBlockNameMap.at(id); } +const char* GetErrStateNameStr(rsmi_ras_err_state_t st) { return kErrStateNameMap.at(st); } /*const char 
*GetGRPCChanStateStr(grpc_connectivity_state st) { return kGRPCChanState.at(st); }*/ -const char *FreqEnumToStr(rsmi_clk_type rsmi_clk) { - static_assert(RSMI_CLK_TYPE_LAST == RSMI_CLK_TYPE_MEM, - "FreqEnumToStr() needs to be updated"); +const char* FreqEnumToStr(rsmi_clk_type rsmi_clk) { + static_assert(RSMI_CLK_TYPE_LAST == RSMI_CLK_TYPE_MEM, "FreqEnumToStr() needs to be updated"); switch (rsmi_clk) { - case RSMI_CLK_TYPE_SYS: return "System clock"; - case RSMI_CLK_TYPE_DF: return "Data Fabric clock"; - case RSMI_CLK_TYPE_DCEF: return "Display Controller Engine clock"; - case RSMI_CLK_TYPE_SOC: return "SOC clock"; - case RSMI_CLK_TYPE_MEM: return "Memory clock"; - default: return "Invalid Clock ID"; + case RSMI_CLK_TYPE_SYS: + return "System clock"; + case RSMI_CLK_TYPE_DF: + return "Data Fabric clock"; + case RSMI_CLK_TYPE_DCEF: + return "Display Controller Engine clock"; + case RSMI_CLK_TYPE_SOC: + return "SOC clock"; + case RSMI_CLK_TYPE_MEM: + return "Memory clock"; + default: + return "Invalid Clock ID"; } } #if ENABLE_SMI -void DumpMonitorInfo(const TestBase *test) { +void DumpMonitorInfo(const TestBase* test) { int ret = 0; uint32_t value; uint32_t value2; @@ -259,16 +240,14 @@ void DumpMonitorInfo(const TestBase *test) { std::vector val_vec; assert(test != nullptr); - assert(test->monitor_devices() != nullptr && - "Make sure to call test->set_monitor_devices()"); - auto print_attr_label = - [&](std::string attrib) -> bool { - std::cout << "\t** " << attrib; - if (ret == -1) { - std::cout << "not available" << std::endl; - return false; - } - return true; + assert(test->monitor_devices() != nullptr && "Make sure to call test->set_monitor_devices()"); + auto print_attr_label = [&](std::string attrib) -> bool { + std::cout << "\t** " << attrib; + if (ret == -1) { + std::cout << "not available" << std::endl; + return false; + } + return true; }; auto delim = "\t***********************************"; @@ -277,8 +256,7 @@ void DumpMonitorInfo(const TestBase 
*test) { std::cout << delim << std::endl; std::cout.setf(std::ios::dec, std::ios::basefield); for (auto dev : *test->monitor_devices()) { - auto print_vector = - [&](amd::smi::DevInfoTypes type, std::string label) { + auto print_vector = [&](amd::smi::DevInfoTypes type, std::string label) { ret = dev->readDevInfo(type, &val_vec); if (print_attr_label(label)) { for (auto vs : val_vec) { @@ -287,8 +265,7 @@ void DumpMonitorInfo(const TestBase *test) { val_vec.clear(); } }; - auto print_val_str = - [&](amd::smi::DevInfoTypes type, std::string label) { + auto print_val_str = [&](amd::smi::DevInfoTypes type, std::string label) { ret = dev->readDevInfo(type, &val_str); std::cout << "\t** " << label; @@ -297,16 +274,14 @@ void DumpMonitorInfo(const TestBase *test) { } else { std::cout << val_str; } - std::cout << std:: endl; + std::cout << std::endl; }; print_val_str(amd::smi::kDevDevID, "Device ID: "); print_val_str(amd::smi::kDevPerfLevel, "Performance Level: "); print_val_str(amd::smi::kDevOverDriveLevel, "OverDrive Level: "); - print_vector(amd::smi::kDevGPUMClk, - "Supported GPU Memory clock frequencies:\n"); - print_vector(amd::smi::kDevGPUSClk, - "Supported GPU clock frequencies:\n"); + print_vector(amd::smi::kDevGPUMClk, "Supported GPU Memory clock frequencies:\n"); + print_vector(amd::smi::kDevGPUSClk, "Supported GPU clock frequencies:\n"); if (dev->monitor() != nullptr) { ret = dev->monitor()->readMonitor(amd::smi::kMonName, &val_str); @@ -316,7 +291,7 @@ void DumpMonitorInfo(const TestBase *test) { ret = dev->monitor()->readMonitor(amd::smi::kMonTemp, &value); if (print_attr_label("Temperature: ")) { - std::cout << static_cast(value)/1000.0 << "C" << std::endl; + std::cout << static_cast(value) / 1000.0 << "C" << std::endl; } std::cout.setf(std::ios::dec, std::ios::basefield); @@ -326,8 +301,8 @@ void DumpMonitorInfo(const TestBase *test) { ret = dev->monitor()->readMonitor(amd::smi::kMonFanSpeed, &value2); } if (print_attr_label("Current Fan Speed: ")) { - 
std::cout << value2/static_cast(value) * 100 << "% (" << - value2 << "/" << value << ")" << std::endl; + std::cout << value2 / static_cast(value) * 100 << "% (" << value2 << "/" << value + << ")" << std::endl; } } std::cout << "\t=======" << std::endl; diff --git a/projects/rdc/tests/rdc_tests/test_common.h b/projects/rdc/tests/rdc_tests/test_common.h old mode 100755 new mode 100644 index a6cd5b1804..065fdfc9ec --- a/projects/rdc/tests/rdc_tests/test_common.h +++ b/projects/rdc/tests/rdc_tests/test_common.h @@ -24,8 +24,8 @@ THE SOFTWARE. #define TESTS_RDC_TESTS_TEST_COMMON_H_ #include -#include #include +#include #include "rocm_smi/rocm_smi.h" @@ -45,38 +45,40 @@ struct RDCTstGlobals { uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list); void PrintTestHeader(uint32_t dv_ind); -const char *GetBlockNameStr(rsmi_gpu_block_t id); -const char *GetErrStateNameStr(rsmi_ras_err_state_t st); -//const char *GetGRPCChanStateStr(grpc_connectivity_state st); -const char *FreqEnumToStr(rsmi_clk_type rsmi_clk); +const char* GetBlockNameStr(rsmi_gpu_block_t id); +const char* GetErrStateNameStr(rsmi_ras_err_state_t st); +// const char *GetGRPCChanStateStr(grpc_connectivity_state st); +const char* FreqEnumToStr(rsmi_clk_type rsmi_clk); #if ENABLE_SMI -void DumpMonitorInfo(const TestBase *test); +void DumpMonitorInfo(const TestBase* test); #endif -#define DISPLAY_RDC_ERR(RET) { \ - if (RET != RDC_STATUS_SUCCESS) { \ - const char *err_str; \ - std::cout << "\t===> ERROR: RDC call returned " << (RET) << std::endl; \ - rdc_status_string((RET), &err_str); \ - std::cout << "\t===> (" << err_str << ")" << std::endl; \ - std::cout << "\t===> at " << __FILE__ << ":" << std::dec << __LINE__ << \ - std::endl; \ - } \ -} +#define DISPLAY_RDC_ERR(RET) \ + { \ + if (RET != RDC_STATUS_SUCCESS) { \ + const char* err_str; \ + std::cout << "\t===> ERROR: RDC call returned " << (RET) << std::endl; \ + rdc_status_string((RET), &err_str); \ + std::cout << "\t===> (" << err_str << 
")" << std::endl; \ + std::cout << "\t===> at " << __FILE__ << ":" << std::dec << __LINE__ << std::endl; \ + } \ + } -#define CHK_ERR_RET(RET) { \ - DISPLAY_RDC_ERR(RET) \ - if ((RET) != RDC_STATUS_SUCCESS) { \ - return (RET); \ - } \ -} -#define CHK_RDC_PERM_ERR(RET) { \ - if (RET == RDC_STATUS_PERMISSION) { \ +#define CHK_ERR_RET(RET) \ + { \ + DISPLAY_RDC_ERR(RET) \ + if ((RET) != RDC_STATUS_SUCCESS) { \ + return (RET); \ + } \ + } +#define CHK_RDC_PERM_ERR(RET) \ + { \ + if (RET == RDC_STATUS_PERMISSION) { \ std::cout << "This command requires root access." << std::endl; \ - } else { \ - DISPLAY_RDC_ERR(RET) \ - } \ -} + } else { \ + DISPLAY_RDC_ERR(RET) \ + } \ + } #endif // TESTS_RDC_TESTS_TEST_COMMON_H_ diff --git a/projects/rdc/tests/rdc_tests/test_utils.cc b/projects/rdc/tests/rdc_tests/test_utils.cc old mode 100755 new mode 100644 index 41ae08126c..0bf6ddd5cd --- a/projects/rdc/tests/rdc_tests/test_utils.cc +++ b/projects/rdc/tests/rdc_tests/test_utils.cc @@ -43,12 +43,13 @@ * */ +#include "rdc_tests/test_utils.h" + #include #include "rocm_smi/rocm_smi.h" -#include "rdc_tests/test_utils.h" -static const std::map kDevFWNameMap = { +static const std::map kDevFWNameMap = { {RSMI_FW_BLOCK_ASD, "asd"}, {RSMI_FW_BLOCK_CE, "ce"}, {RSMI_FW_BLOCK_DMCU, "dmcu"}, @@ -72,8 +73,4 @@ static const std::map kDevFWNameMap = { {RSMI_FW_BLOCK_VCN, "vcn"}, }; - -const char * -NameFromFWEnum(rsmi_fw_block_t blk) { - return kDevFWNameMap.at(blk); -} +const char* NameFromFWEnum(rsmi_fw_block_t blk) { return kDevFWNameMap.at(blk); } diff --git a/projects/rdc/tests/rdc_tests/test_utils.h b/projects/rdc/tests/rdc_tests/test_utils.h old mode 100755 new mode 100644 index 9b4e92c589..6fc0ef2a0d --- a/projects/rdc/tests/rdc_tests/test_utils.h +++ b/projects/rdc/tests/rdc_tests/test_utils.h @@ -48,7 +48,6 @@ #include "rocm_smi/rocm_smi.h" -const char * -NameFromFWEnum(rsmi_fw_block_t blk); +const char* NameFromFWEnum(rsmi_fw_block_t blk); #endif // TESTS_RDC_TESTS_TEST_UTILS_H_