Files
rocm-systems/client/include/rdc/rdc_client.h
T
Galantsev, Dmitrii 434e40305d LINT: Add cpplint, clang-format and pre-commit support
Change-Id: I3cbb787ef27d90486b212dfb1a8c77c460acc2ac
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
2024-01-09 11:37:11 -06:00

383 lines
16 KiB
C++

/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
#include <grpcpp/grpcpp.h>
#include <memory>
#include <string>
#include "rocm_smi/rocm_smi.h"
/**
* @brief Error codes returned by rdc functions
*
* The values fall into separate numeric ranges:
* - ::RDC_STATUS_SUCCESS (0)
* - the RDC_RSMI_STATUS_* codes, implicitly numbered from 1
* - the RDC_STATUS_GRPC_* codes, starting at ::RDC_STATUS_GRPC_ERR_FIRST
* - the RDC_STATUS_CLIENT_* codes, starting at ::RDC_STATUS_CLIENT_ERR_FIRST
* - ::RDC_STATUS_UNKNOWN_ERROR (0xFFFFFFFF)
*
* Most enumerators rely on implicit numbering, so inserting or reordering
* entries changes the numeric value of every enumerator that follows within
* the same range.
*/
typedef enum {
RDC_STATUS_SUCCESS = 0x0, //!< Operation was successful
// NOTE(review): the RDC_RSMI_STATUS_* names presumably mirror rsmi_status_t
// from rocm_smi.h -- confirm that the ordering (and therefore the implicit
// values) matches before relying on a direct numeric conversion.
RDC_RSMI_STATUS_INVALID_ARGS, //!< Passed in arguments are not valid
RDC_RSMI_STATUS_NOT_SUPPORTED, //!< The requested information or
//!< action is not available for the
//!< given input, on the given system
RDC_RSMI_STATUS_FILE_ERROR, //!< Problem accessing a file. This
//!< may be because the operation is
//!< not supported by the Linux kernel
//!< version running on the executing
//!< machine
RDC_RSMI_STATUS_PERMISSION, //!< Permission denied/EACCES file
//!< error. Many functions require
//!< root access to run.
RDC_RSMI_STATUS_OUT_OF_RESOURCES, //!< Unable to acquire memory or other
//!< resource
RDC_RSMI_STATUS_INTERNAL_EXCEPTION, //!< An internal exception was caught
RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS, //!< The provided input is out of
//!< allowable or safe range
RDC_RSMI_STATUS_INIT_ERROR, //!< An error occurred when creating
//!< a communications channel
RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED, //!< The requested function has not
//!< yet been implemented in the
//!< current system for the current
//!< devices
RDC_RSMI_STATUS_NOT_FOUND, //!< An item was searched for but not
//!< found
RDC_RSMI_STATUS_INSUFFICIENT_SIZE, //!< Not enough resources were
//!< available for the operation
RDC_RSMI_STATUS_INTERRUPT, //!< An interrupt occurred during
//!< execution of function
RDC_RSMI_STATUS_UNEXPECTED_SIZE, //!< An unexpected amount of data
//!< was read
RDC_RSMI_STATUS_NO_DATA, //!< No data was found for a given
//!< input
RDC_RSMI_STATUS_UNKNOWN_ERROR, //!< An unknown error occurred
// NOTE(review): the RDC_STATUS_GRPC_* enumerators below are numbered
// implicitly in the order they are listed. gRPC's StatusCode assigns
// UNAUTHENTICATED the value 16 but lists it between PERMISSION_DENIED (7)
// and RESOURCE_EXHAUSTED (8); the same textual order is used here. As a
// result, (value - RDC_STATUS_GRPC_ERR_FIRST) equals the gRPC numeric code
// only up to RDC_STATUS_GRPC_PERM_DENIED. Confirm that the conversion code
// uses an explicit mapping rather than adding the offset.
/// First value of the RDC_STATUS_GRPC_* range.
RDC_STATUS_GRPC_ERR_FIRST = 1000,
/// Not an error; returned on success. Shares its value with
/// ::RDC_STATUS_GRPC_ERR_FIRST and is numerically distinct from
/// ::RDC_STATUS_SUCCESS.
RDC_STATUS_GRPC_OK = RDC_STATUS_GRPC_ERR_FIRST,
/// The operation was cancelled (typically by the caller).
RDC_STATUS_GRPC_CANCELLED,
/// Unknown error. An example of where this error may be returned is if a
/// Status value received from another address space belongs to an error-space
/// that is not known in this address space. Also errors raised by APIs that
/// do not return enough error information may be converted to this error.
RDC_STATUS_GRPC_UNKNOWN,
/// Client specified an invalid argument. Note that this differs from
/// FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are
/// problematic regardless of the state of the system (e.g., a malformed file
/// name).
RDC_STATUS_GRPC_INVALID_ARG,
/// Deadline expired before operation could complete. For operations that
/// change the state of the system, this error may be returned even if the
/// operation has completed successfully. For example, a successful response
/// from a server could have been delayed long enough for the deadline to
/// expire.
RDC_STATUS_GRPC_DEADLINE_EXCEEDED,
/// Some requested entity (e.g., file or directory) was not found.
RDC_STATUS_GRPC_NOT_FOUND,
/// Some entity that we attempted to create (e.g., file or directory) already
/// exists.
RDC_STATUS_GRPC_ALREADY_EXISTS,
/// The caller does not have permission to execute the specified operation.
/// PERMISSION_DENIED must not be used for rejections caused by exhausting
/// some resource (use RESOURCE_EXHAUSTED instead for those errors).
/// PERMISSION_DENIED must not be used if the caller can not be identified
/// (use UNAUTHENTICATED instead for those errors).
RDC_STATUS_GRPC_PERM_DENIED,
/// The request does not have valid authentication credentials for the
/// operation.
RDC_STATUS_GRPC_UNAUTHENTICATED,
/// Some resource has been exhausted, perhaps a per-user quota, or perhaps the
/// entire file system is out of space.
RDC_STATUS_GRPC_RESOURCE_EXHAUSTED,
/// Operation was rejected because the system is not in a state required for
/// the operation's execution. For example, directory to be deleted may be
/// non-empty, an rmdir operation is applied to a non-directory, etc.
///
/// A litmus test that may help a service implementor in deciding
/// between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:
/// (a) Use UNAVAILABLE if the client can retry just the failing call.
/// (b) Use ABORTED if the client should retry at a higher-level
/// (e.g., restarting a read-modify-write sequence).
/// (c) Use FAILED_PRECONDITION if the client should not retry until
/// the system state has been explicitly fixed. E.g., if an "rmdir"
/// fails because the directory is non-empty, FAILED_PRECONDITION
/// should be returned since the client should not retry unless
/// they have first fixed up the directory by deleting files from it.
/// (d) Use FAILED_PRECONDITION if the client performs conditional
/// REST Get/Update/Delete on a resource and the resource on the
/// server does not match the condition. E.g., conflicting
/// read-modify-write on the same resource.
RDC_STATUS_GRPC_FAILED_PRECOND,
/// The operation was aborted, typically due to a concurrency issue like
/// sequencer check failures, transaction aborts, etc.
///
/// See litmus test above for deciding between FAILED_PRECONDITION, ABORTED,
/// and UNAVAILABLE.
RDC_STATUS_GRPC_ABORTED,
/// Operation was attempted past the valid range. E.g., seeking or reading
/// past end of file.
///
/// Unlike INVALID_ARGUMENT, this error indicates a problem that may be fixed
/// if the system state changes. For example, a 32-bit file system will
/// generate INVALID_ARGUMENT if asked to read at an offset that is not in the
/// range [0,2^32-1], but it will generate OUT_OF_RANGE if asked to read from
/// an offset past the current file size.
///
/// There is a fair bit of overlap between FAILED_PRECONDITION and
/// OUT_OF_RANGE. We recommend using OUT_OF_RANGE (the more specific error)
/// when it applies so that callers who are iterating through a space can
/// easily look for an OUT_OF_RANGE error to detect when they are done.
RDC_STATUS_GRPC_OUT_OF_RANGE,
/// Operation is not implemented or not supported/enabled in this service.
RDC_STATUS_GRPC_UNIMPLEMENTED,
/// Internal errors. Means some invariants expected by the underlying system
/// have been broken. If you see one of these errors, something is very
/// broken.
RDC_STATUS_GRPC_INTERNAL,
/// The service is currently unavailable. This is most likely a transient
/// condition and may be corrected by retrying with a backoff.
///
/// \warning Although data MIGHT not have been transmitted when this
/// status occurs, there is NOT A GUARANTEE that the server has not seen
/// anything. So in general it is unsafe to retry on this status code
/// if the call is non-idempotent.
///
/// See litmus test above for deciding between FAILED_PRECONDITION, ABORTED,
/// and UNAVAILABLE.
RDC_STATUS_GRPC_UNAVAILABLE,
/// Unrecoverable data loss or corruption.
RDC_STATUS_GRPC_DATA_LOSS,
/// First value of the RDC_STATUS_CLIENT_* range.
RDC_STATUS_CLIENT_ERR_FIRST = 2000,
/// SSL authentication error occurred. Shares its value with
/// ::RDC_STATUS_CLIENT_ERR_FIRST.
RDC_STATUS_CLIENT_ERR_SSL = RDC_STATUS_CLIENT_ERR_FIRST,
// All 32 bits set; numerically separate from every range above.
RDC_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
} rdc_status_t;
/**
* @brief Handle to RDC server channel
*
* @details A handle is written by rdc_channel_create() and its resources are
* released by rdc_channel_destroy(). The handle is an integer wide enough to
* hold a pointer value (uintptr_t); callers should treat it as opaque.
*/
typedef uintptr_t rdc_channel_t;
/**
* @brief Default RDC server port
*
* Documented as the fallback when nullptr is passed as the port argument of
* rdc_channel_create(). Note that this macro is an integer literal, whereas
* that argument is a string.
*/
#define RDC_DEFAULT_SERVER_PORT 50051
/**
* @brief Default RDC server address
*
* Documented as the fallback when nullptr is passed as the ip argument of
* rdc_channel_create().
*/
#define RDC_DEFAULT_SERVER_IP "localhost"
/*****************************************************************************/
/** @defgroup RDCAdmin RDC Administration Functions
* These administrative functions are used to monitor and control aspects of
* RDC such as, for example, RDC connectivity.
* @{
*/
/**
* @brief Check the connection status of a channel
*
* @details Given an ::rdc_channel_t @p channel and a boolean @p
* try_to_connect, this function will return the grpc_connectivity_state for
* that channel
*
* @param[in] channel The channel for which the status will be given
*
* @param[in] try_to_connect If this argument is true and the channel is
* currently IDLE, the channel will transition to CONNECTING.
*
* @param[inout] state A pointer to caller provided memory to which
* the grpc_connectivity_state will be written. grpc_connectivity_state has
* the following possible values:
* GRPC_CHANNEL_IDLE channel is idle
* GRPC_CHANNEL_CONNECTING channel is connecting
* GRPC_CHANNEL_READY channel is ready for work
* GRPC_CHANNEL_TRANSIENT_FAILURE channel has seen a failure but expects to
* recover
* GRPC_CHANNEL_SHUTDOWN channel has seen a failure that it cannot
* recover from
*
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
*
*/
rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
grpc_connectivity_state* state);
/**
* @brief Verify a channel's connection to the server
*
* @details Given an ::rdc_channel_t @p channel, this function will send a
* random number to the server associated with @p channel. The server will send
* the number back. Upon receiving the returned message from the server, the
* number sent to the server is compared to the number received from the
* server. If the two numbers are the same, the connection is verified.
* Otherwise, an appropriate error code is returned.
*
* @param[in] channel The channel for which the connection will be verified
*
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
*
*/
rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel);
/** @} */ // end of RDCAdmin
/*****************************************************************************/
/** @defgroup InitShutAdmin Initialization and Shutdown
* These functions are used for initialization of RDC and clean up when
* done.
* @{
*/
/**
* @brief Create a communications channel to an RDC server
*
* @details Given a pointer to an ::rdc_channel_t @p channel, a string
* containing the ip address of the server @p ip, a string containing
* the port number on which the server is listening @p port and a bool
* indicating whether the channel should use a secure link @p secure,
* this function will attempt to create a new channel and write its
* location to the address pointed to by @p channel.
*
* @param[inout] channel A pointer to caller provided memory to which an
* ::rdc_channel_t will be written
*
* @param[in] ip A pointer to a string containing the address of the server.
* If nullptr is passed for this parameter, RDC_DEFAULT_SERVER_IP will be used.
*
* @param[in] port A pointer to a string containing the port on which the
* RDC server is listening. If nullptr is passed for this parameter,
* RDC_DEFAULT_SERVER_PORT will be used.
*
* @param[in] secure A bool indicating whether SSL should be used for
* communications (not currently supported)
*
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
*
*/
rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port,
bool secure);
/**
* @brief Destroy a communications channel to an RDC server
*
* @details Given an ::rdc_channel_t @p channel, this function will free any
* resources used by @p channel
*
* @param[in] channel The ::rdc_channel_t whose resources will be freed. The
* handle is passed by value, so the caller's copy is not modified by this
* call.
*
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
*
*/
rdc_status_t rdc_channel_destroy(rdc_channel_t channel);
/** @} */ // end of InitShutAdmin
/*****************************************************************************/
/** @defgroup RSMIAccess Remote ROCm SMI Calls
* These functions make ROCm SMI function calls on the remote server.
* Please refer to the
* [ROCm SMI documentation]
* (https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/docs) for
* information about the calls. Here, we will document any additional aspects
* of the calls introduced by RDC that are not covered in the ROCm SMI
* documentation.
*
* All of the functions in this section attempt to make an RSMI call on the
* server machine, given an ::rdc_channel_t associated with the server, and
* all the arguments that are required to make the RSMI call.
* @{
*/
/**
* @brief Remote call to rsmi_num_monitor_devices()
*
* @param[in] channel The ::rdc_channel_t associated with the server on which
* the RSMI call is to be made
*
* @param[inout] num_gpu A pointer to caller provided memory; presumably the
* device count reported by the server is written here (TODO confirm against
* the implementation)
*
* @retval ::RDC_STATUS_SUCCESS presumably returned upon successful call, as
* documented for the other functions in this header (TODO confirm)
*
*/
rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu);
/** @} */ // end of RSMIAccess
/** @defgroup PhysQuer Physical State Queries
* These functions provide information about the physical characteristics of
* the device.
* @{
*/
/**
* @brief Remote call to rsmi_dev_temp_metric_get()
*
* @param[in] channel The ::rdc_channel_t associated with the server on which
* the RSMI call is to be made
*
* @param[in] dv_ind Device index; presumably forwarded to the RSMI call and
* therefore relative to the server's devices (TODO confirm)
*
* @param[in] sensor_type Sensor selector; presumably forwarded unchanged to
* the RSMI call -- see the ROCm SMI documentation for valid values
*
* @param[in] metric The ::rsmi_temperature_metric_t to query
*
* @param[inout] temperature A pointer to caller provided memory; presumably
* the value obtained from the server is written here. Units are those of the
* RSMI call -- see the ROCm SMI documentation.
*
* @retval ::RDC_STATUS_SUCCESS presumably returned upon successful call, as
* documented for the other functions in this header (TODO confirm)
*
*/
rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type,
rsmi_temperature_metric_t metric, int64_t* temperature);
/**
* @brief Remote call to rsmi_dev_fan_rpms_get()
*
* @param[in] channel The ::rdc_channel_t associated with the server on which
* the RSMI call is to be made
*
* @param[in] dv_ind Device index; presumably forwarded to the RSMI call and
* therefore relative to the server's devices (TODO confirm)
*
* @param[in] sensor_ind Sensor index; presumably forwarded unchanged to the
* RSMI call -- see the ROCm SMI documentation
*
* @param[inout] rpms A pointer to caller provided memory; presumably the
* value obtained from the server is written here (TODO confirm)
*
* @retval ::RDC_STATUS_SUCCESS presumably returned upon successful call, as
* documented for the other functions in this header (TODO confirm)
*
*/
rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
int64_t* rpms);
/**
* @brief Remote call to rsmi_dev_fan_speed_get()
*
* @param[in] channel The ::rdc_channel_t associated with the server on which
* the RSMI call is to be made
*
* @param[in] dv_ind Device index; presumably forwarded to the RSMI call and
* therefore relative to the server's devices (TODO confirm)
*
* @param[in] sensor_ind Sensor index; presumably forwarded unchanged to the
* RSMI call -- see the ROCm SMI documentation
*
* @param[inout] speed A pointer to caller provided memory; presumably the
* value obtained from the server is written here. The scale of the value is
* that of the RSMI call -- see the ROCm SMI documentation.
*
* @retval ::RDC_STATUS_SUCCESS presumably returned upon successful call, as
* documented for the other functions in this header (TODO confirm)
*
*/
rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
int64_t* speed);
/**
* @brief Remote call to rsmi_dev_fan_speed_max_get()
*
* @param[in] channel The ::rdc_channel_t associated with the server on which
* the RSMI call is to be made
*
* @param[in] dv_ind Device index; presumably forwarded to the RSMI call and
* therefore relative to the server's devices (TODO confirm)
*
* @param[in] sensor_ind Sensor index; presumably forwarded unchanged to the
* RSMI call -- see the ROCm SMI documentation
*
* @param[inout] max_speed A pointer to caller provided memory; presumably the
* value obtained from the server is written here. Note that this output is
* unsigned (uint64_t), unlike the int64_t output of rdc_dev_fan_speed_get().
*
* @retval ::RDC_STATUS_SUCCESS presumably returned upon successful call, as
* documented for the other functions in this header (TODO confirm)
*
*/
rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
uint64_t* max_speed);
/** @} */ // end of PhysQuer
/**
* @brief Get a description of a provided RDC error status
*
* @details Set the provided pointer to a const char *, @p status_string, to
* a string containing a description of the provided error code @p status.
*
* @param[in] status The error status for which a description is desired
*
* @param[inout] status_string A pointer to a const char * which will be made
* to point to a description of the provided error code
*
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call
*
*/
rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string);
#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_H_