Files
rocm-systems/projects/rdc/protos/rdc.proto
T
Galantsev, Dmitrii 38c60ff90b RVS: Finish initial RVS integration
NOTE: RVS Build is disabled by default due to CI build issues.

Change-Id: I1593f0fe22075a9f86f54afa3ac151e109f1f7bd
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: eaa1862a80]
2024-01-10 00:27:04 -06:00

547 lines
14 KiB
Protocol Buffer
Executable File

// Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
syntax = "proto3";
// option java_multiple_files = true;
// option java_package = "io.grpc.examples.helloworld";
// option java_outer_classname = "HelloWorldProto";
// option objc_class_prefix = "HLW";
package rdc;
/****************************************************************************/
/********************************** Rsmi Service ****************************/
/****************************************************************************/
// RPC surface for the RSMI-backed queries declared in this file: one device
// count call plus temperature and fan readings. All methods are unary.
service Rsmi {
  // ---- RSMI ID services ----
  rpc GetNumDevices(GetNumDevicesRequest) returns (GetNumDevicesResponse);

  // ---- RSMI physical queries ----
  rpc GetTemperature(GetTemperatureRequest) returns (GetTemperatureResponse);
  rpc GetFanRpms(GetFanRpmsRequest) returns (GetFanRpmsResponse);
  rpc GetFanSpeed(GetFanSpeedRequest) returns (GetFanSpeedResponse);
  rpc GetFanSpeedMax(GetFanSpeedMaxRequest) returns (GetFanSpeedMaxResponse);
}
/* rsmi_num_monitor_devices() */
// Request for Rsmi.GetNumDevices. Carries no fields.
message GetNumDevicesRequest {}
// Reply for Rsmi.GetNumDevices.
message GetNumDevicesResponse {
  // Result value of the query (presumably the number of monitored devices).
  uint64 val = 1;

  // Return code of the underlying call.
  // NOTE(review): presumably an rsmi_status_t value - confirm with the server.
  uint64 ret_val = 2;
}
/* GetTemperature */
/* rsmi_dev_temp_metric_get() */
// Request for Rsmi.GetTemperature: selects a device, a sensor and which
// temperature metric to read.
message GetTemperatureRequest {
  // Which temperature quantity is being asked for. The zero value is
  // RSMI_TEMP_CURRENT, so an unset `metric` reads as "current".
  // NOTE(review): names look like rsmi_temperature_metric_t - confirm the
  // numeric values stay in sync with that C enum.
  enum TemperatureMetric {
    RSMI_TEMP_CURRENT        = 0;
    RSMI_TEMP_MAX            = 1;
    RSMI_TEMP_MIN            = 2;
    RSMI_TEMP_MAX_HYST       = 3;
    RSMI_TEMP_MIN_HYST       = 4;
    RSMI_TEMP_CRITICAL       = 5;
    RSMI_TEMP_CRITICAL_HYST  = 6;
    RSMI_TEMP_EMERGENCY      = 7;
    RSMI_TEMP_EMERGENCY_HYST = 8;
    RSMI_TEMP_CRIT_MIN       = 9;
    RSMI_TEMP_CRIT_MIN_HYST  = 10;
    RSMI_TEMP_OFFSET         = 11;
    RSMI_TEMP_LOWEST         = 12;
    RSMI_TEMP_HIGHEST        = 13;
  }

  // Device index (presumably the RSMI device index).
  uint32 dv_ind = 1;

  // Sensor selector, passed as a raw integer.
  uint32 sensor_type = 2;

  // Metric to read from the selected sensor.
  TemperatureMetric metric = 3;
}
// Reply for Rsmi.GetTemperature.
message GetTemperatureResponse {
  // Temperature reading. Signed; units are not stated in this file.
  int64 temperature = 1;

  // Return code of the underlying call.
  // NOTE(review): presumably an rsmi_status_t value - confirm with the server.
  uint64 ret_val = 2;
}
/* GetFanRpms */
/* rsmi_dev_fan_rpms_get() */
// Request for Rsmi.GetFanRpms.
message GetFanRpmsRequest {
  // Device index.
  uint32 dv_ind = 1;

  // Index of the fan sensor on that device.
  uint32 sensor_ind = 2;
}
// Reply for Rsmi.GetFanRpms.
message GetFanRpmsResponse {
  // Fan speed reading in RPMs (per the field name).
  int64 rpms = 1;

  // Return code of the underlying call.
  // NOTE(review): presumably an rsmi_status_t value - confirm with the server.
  uint64 ret_val = 2;
}
/* GetFanSpeed */
/* rsmi_dev_fan_speed_get() */
// Request for Rsmi.GetFanSpeed.
message GetFanSpeedRequest {
  // Device index.
  uint32 dv_ind = 1;

  // Index of the fan sensor on that device.
  uint32 sensor_ind = 2;
}
// Reply for Rsmi.GetFanSpeed.
message GetFanSpeedResponse {
  // Fan speed reading. The scale is not stated in this file
  // (presumably relative to GetFanSpeedMax - TODO confirm).
  int64 speed = 1;

  // Return code of the underlying call.
  // NOTE(review): presumably an rsmi_status_t value - confirm with the server.
  uint64 ret_val = 2;
}
/* GetFanSpeedMax */
/* rsmi_dev_fan_speed_max_get() */
// Request for Rsmi.GetFanSpeedMax.
message GetFanSpeedMaxRequest {
  // Device index.
  uint32 dv_ind = 1;

  // Index of the fan sensor on that device.
  uint32 sensor_ind = 2;
}
// Reply for Rsmi.GetFanSpeedMax.
message GetFanSpeedMaxResponse {
  // Maximum fan speed value. Unsigned, unlike GetFanSpeedResponse.speed.
  uint64 max_speed = 1;

  // Return code of the underlying call.
  // NOTE(review): presumably an rsmi_status_t value - confirm with the server.
  uint64 ret_val = 2;
}
/****************************************************************************/
/********************************** RdcAdmin Service ************************/
/****************************************************************************/
// Administrative RPCs for RDC. Currently a single unary connectivity check.
service RdcAdmin {
  // ---- RDC admin services ----
  rpc VerifyConnection(VerifyConnectionRequest)
      returns (VerifyConnectionResponse);
}
/* VerifyConnection */
// Request for RdcAdmin.VerifyConnection.
message VerifyConnectionRequest {
  // Caller-chosen number; the response has a matching echo_magic_num field.
  uint64 magic_num = 1;
}
// Reply for RdcAdmin.VerifyConnection.
message VerifyConnectionResponse {
  // Presumably the request's magic_num echoed back unchanged - confirm
  // against the server implementation.
  uint64 echo_magic_num = 1;
}
/****************************************************************************/
/********************************** RdcAPI Service ************************/
/****************************************************************************/
// Remote form of the RDC C API. Every method is unary; the comment on each
// method quotes the C prototype it corresponds to.
service RdcAPI {
  // ------------------------------ Discovery API ----------------------------

  // rdc_status_t rdc_get_all_devices(
  //     uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count)
  rpc GetAllDevices(Empty) returns (GetAllDevicesResponse);

  // rdc_status_t rdc_get_device_attributes(
  //     uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr)
  rpc GetDeviceAttributes(GetDeviceAttributesRequest)
      returns (GetDeviceAttributesResponse);

  // -------------------------------- Group API ------------------------------

  // rdc_status_t rdc_group_gpu_create(
  //     rdc_group_type_t type, const char* group_name,
  //     rdc_gpu_group_t* p_rdc_group_id)
  rpc CreateGpuGroup(CreateGpuGroupRequest) returns (CreateGpuGroupResponse);

  // rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index)
  rpc AddToGpuGroup(AddToGpuGroupRequest) returns (AddToGpuGroupResponse);

  // rdc_status_t rdc_group_field_create(
  //     uint32_t num_field_ids, uint32_t* field_ids,
  //     const char* field_group_name, rdc_field_grp_t* rdc_field_group_id)
  rpc CreateFieldGroup(CreateFieldGroupRequest)
      returns (CreateFieldGroupResponse);

  // rdc_status_t rdc_group_field_get_info(
  //     rdc_field_grp_t rdc_field_group_id,
  //     rdc_field_group_info_t* field_group_info)
  rpc GetFieldGroupInfo(GetFieldGroupInfoRequest)
      returns (GetFieldGroupInfoResponse);

  // rdc_status_t rdc_group_gpu_get_info(
  //     rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info)
  rpc GetGpuGroupInfo(GetGpuGroupInfoRequest)
      returns (GetGpuGroupInfoResponse);

  // rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id)
  rpc DestroyGpuGroup(DestroyGpuGroupRequest)
      returns (DestroyGpuGroupResponse);

  // rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id)
  rpc DestroyFieldGroup(DestroyFieldGroupRequest)
      returns (DestroyFieldGroupResponse);

  // -------------------------------- Field API ------------------------------

  // rdc_status_t rdc_watch_fields(
  //     rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id,
  //     uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples)
  rpc WatchFields(WatchFieldsRequest) returns (WatchFieldsResponse);

  // rdc_status_t rdc_get_latest_value_for_field(
  //     uint32_t gpu_index, uint32_t field, rdc_field_value* value)
  rpc GetLatestFieldValue(GetLatestFieldValueRequest)
      returns (GetLatestFieldValueResponse);

  // rdc_status_t rdc_get_field_value_since(
  //     uint32_t gpu_index, uint32_t field, uint64_t since_time_stamp,
  //     uint64_t *next_since_time_stamp, rdc_field_value* value)
  rpc GetFieldSince(GetFieldSinceRequest) returns (GetFieldSinceResponse);

  // rdc_status_t rdc_unwatch_fields(
  //     rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id)
  rpc UnWatchFields(UnWatchFieldsRequest) returns (UnWatchFieldsResponse);

  // rdc_status_t rdc_update_all_fields(uint32_t wait_for_update)
  rpc UpdateAllFields(UpdateAllFieldsRequest)
      returns (UpdateAllFieldsResponse);

  // rdc_status_t rdc_group_get_all_ids(
  //     rdc_gpu_group_t group_id_list[], uint32_t* count)
  rpc GetGroupAllIds(Empty) returns (GetGroupAllIdsResponse);

  // rdc_status_t rdc_group_field_all_ids(
  //     rdc_field_grp_t field_group_id_list[], uint32_t* count)
  rpc GetFieldGroupAllIds(Empty) returns (GetFieldGroupAllIdsResponse);

  // --------------------------------- Job API -------------------------------

  // rdc_status_t rdc_job_start_stats(
  //     rdc_gpu_group_t groupId, char job_id[64], uint64_t update_freq)
  rpc StartJobStats(StartJobStatsRequest) returns (StartJobStatsResponse);

  // rdc_status_t rdc_job_get_stats(char jobId[64], rdc_job_info_t* p_job_info)
  rpc GetJobStats(GetJobStatsRequest) returns (GetJobStatsResponse);

  // rdc_status_t rdc_job_stop_stats(char job_id[64])
  rpc StopJobStats(StopJobStatsRequest) returns (StopJobStatsResponse);

  // rdc_status_t rdc_job_remove(char job_id[64])
  rpc RemoveJob(RemoveJobRequest) returns (RemoveJobResponse);

  // rdc_status_t rdc_job_remove_all()
  rpc RemoveAllJob(Empty) returns (RemoveAllJobResponse);

  // ------------------------------ Diagnostic API ---------------------------

  // rdc_status_t rdc_diagnostic_run(
  //     rdc_gpu_group_t group_id, rdc_diag_level_t level,
  //     const char* config, size_t config_size,
  //     rdc_diag_response_t* response);
  rpc DiagnosticRun(DiagnosticRunRequest) returns (DiagnosticRunResponse);

  // rdc_status_t rdc_test_case_run(
  //     rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
  //     const char* config, size_t config_size,
  //     rdc_diag_test_result_t* result);
  rpc DiagnosticTestCaseRun(DiagnosticTestCaseRunRequest)
      returns (DiagnosticTestCaseRunResponse);
}
// Field-less request shared by the RdcAPI methods that take no arguments
// (GetAllDevices, GetGroupAllIds, GetFieldGroupAllIds, RemoveAllJob).
message Empty {}
// Reply for RdcAPI.GetAllDevices.
message GetAllDevicesResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // GPU indexes returned by the call; the list length stands in for the C
  // API's separate `count` out-parameter.
  repeated uint32 gpus = 2;
}
// Request for RdcAPI.GetDeviceAttributes.
message GetDeviceAttributesRequest {
  // Index of the GPU whose attributes are requested.
  uint32 gpu_index = 1;
}
// Attribute set of a single device, embedded in GetDeviceAttributesResponse.
message DeviceAttributes {
  // Name of the device.
  string device_name = 1;
}
// Reply for RdcAPI.GetDeviceAttributes.
message GetDeviceAttributesResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Attributes of the requested GPU.
  DeviceAttributes attributes = 2;
}
// Request for RdcAPI.CreateGpuGroup.
message CreateGpuGroupRequest {
  // Kind of group to create. The zero value is RDC_GROUP_DEFAULT, so an
  // unset `type` reads as the default group type.
  enum GpuGroupType {
    RDC_GROUP_DEFAULT = 0;
    RDC_GROUP_EMPTY   = 1;
  }

  // Requested group type.
  GpuGroupType type = 1;

  // Name to give the new group.
  string group_name = 2;
}
// Reply for RdcAPI.CreateGpuGroup.
message CreateGpuGroupResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Identifier of the GPU group that was created.
  uint32 group_id = 2;
}
// Request for RdcAPI.AddToGpuGroup.
message AddToGpuGroupRequest {
  // Identifier of the GPU group to extend.
  uint32 group_id = 1;

  // Index of the GPU to add to that group.
  uint32 gpu_index = 2;
}
// Reply for RdcAPI.AddToGpuGroup.
message AddToGpuGroupResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Request for RdcAPI.CreateFieldGroup.
message CreateFieldGroupRequest {
  // Field ids that make up the group; the list length stands in for the C
  // API's `num_field_ids` argument.
  repeated uint32 field_ids = 1;

  // Name to give the new field group.
  string field_group_name = 2;
}
// Reply for RdcAPI.CreateFieldGroup.
message CreateFieldGroupResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Identifier of the field group that was created.
  uint32 field_group_id = 2;
}
// Request for RdcAPI.GetFieldGroupInfo.
message GetFieldGroupInfoRequest {
  // Identifier of the field group to describe.
  uint32 field_group_id = 1;
}
// Reply for RdcAPI.GetFieldGroupInfo.
message GetFieldGroupInfoResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Name of the field group.
  // NOTE: "filed" is a misspelling of "field". It is kept on purpose:
  // renaming changes generated accessors and the JSON key, so it must be
  // coordinated with every client and server that uses this message.
  string filed_group_name = 2;

  // Field ids contained in the group.
  repeated uint32 field_ids = 3;
}
// Request for RdcAPI.GetGpuGroupInfo.
message GetGpuGroupInfoRequest {
  // Identifier of the GPU group to describe.
  uint32 group_id = 1;
}
// Reply for RdcAPI.GetGpuGroupInfo.
message GetGpuGroupInfoResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Name of the GPU group.
  string group_name = 2;

  // Ids of the entities in the group (presumably GPU indexes - confirm).
  repeated uint32 entity_ids = 3;
}
// Request for RdcAPI.DestroyGpuGroup.
message DestroyGpuGroupRequest {
  // Identifier of the GPU group to destroy.
  uint32 group_id = 1;
}
// Reply for RdcAPI.DestroyGpuGroup.
message DestroyGpuGroupResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Request for RdcAPI.DestroyFieldGroup.
message DestroyFieldGroupRequest {
  // Identifier of the field group to destroy.
  uint32 field_group_id = 1;
}
// Reply for RdcAPI.DestroyFieldGroup.
message DestroyFieldGroupResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Request for RdcAPI.WatchFields; fields follow the rdc_watch_fields()
// argument list.
message WatchFieldsRequest {
  // GPU group to watch.
  uint32 group_id = 1;

  // Field group to watch on those GPUs.
  uint32 field_group_id = 2;

  // Update frequency. Units are not stated in this file - TODO document.
  uint64 update_freq = 3;

  // Maximum age of kept samples. Units are not stated in this file.
  double max_keep_age = 4;

  // Maximum number of samples to keep.
  uint32 max_keep_samples = 5;
}
// Reply for RdcAPI.WatchFields.
message WatchFieldsResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Request for RdcAPI.GetLatestFieldValue.
message GetLatestFieldValueRequest {
  // Index of the GPU to read from.
  uint32 gpu_index = 1;

  // Id of the field to read.
  uint32 field_id = 2;
}
// Reply for RdcAPI.GetLatestFieldValue: one field sample plus result codes.
message GetLatestFieldValueResponse {
  // Type tag for the sample. The zero value is INTEGER.
  // NOTE(review): BLOB is declared here but the `value` oneof below has no
  // bytes member to carry such a payload - confirm how BLOB is transported.
  enum FieldType {
    INTEGER = 0;
    DOUBLE  = 1;
    STRING  = 2;
    BLOB    = 3;
  }

  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Id of the field this sample belongs to.
  uint32 field_id = 2;

  // Second status value, separate from `status`.
  // NOTE(review): presumably the per-sample status - confirm.
  uint32 rdc_status = 3;

  // Timestamp of the sample. Units/epoch are not stated in this file.
  uint64 ts = 4;

  // Declares which kind of value the sample holds.
  FieldType type = 5;

  // The sample payload; at most one member is set.
  oneof value {
    uint64 l_int = 6;
    double dbl   = 7;
    string str   = 8;
  }
}
// Request for RdcAPI.GetFieldSince.
message GetFieldSinceRequest {
  // Index of the GPU to read from.
  uint32 gpu_index = 1;

  // Id of the field to read.
  uint32 field_id = 2;

  // Timestamp cursor to read from; the response supplies the next cursor in
  // next_since_time_stamp.
  uint64 since_time_stamp = 3;
}
// Reply for RdcAPI.GetFieldSince: one field sample, the cursor for the next
// call, and result codes.
message GetFieldSinceResponse {
  // Type tag for the sample. The zero value is INTEGER.
  // NOTE(review): BLOB is declared here but the `value` oneof below has no
  // bytes member to carry such a payload - confirm how BLOB is transported.
  enum FieldType {
    INTEGER = 0;
    DOUBLE  = 1;
    STRING  = 2;
    BLOB    = 3;
  }

  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Cursor to pass as since_time_stamp on the following request.
  uint64 next_since_time_stamp = 2;

  // Id of the field this sample belongs to.
  uint32 field_id = 3;

  // Second status value, separate from `status`.
  // NOTE(review): presumably the per-sample status - confirm.
  uint32 rdc_status = 4;

  // Timestamp of the sample. Units/epoch are not stated in this file.
  uint64 ts = 5;

  // Declares which kind of value the sample holds.
  FieldType type = 6;

  // The sample payload; at most one member is set.
  oneof value {
    uint64 l_int = 7;
    double dbl   = 8;
    string str   = 9;
  }
}
// Request for RdcAPI.UnWatchFields.
message UnWatchFieldsRequest {
  // GPU group to stop watching.
  uint32 group_id = 1;

  // Field group to stop watching on those GPUs.
  uint32 field_group_id = 2;
}
// Reply for RdcAPI.UnWatchFields.
message UnWatchFieldsResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Request for RdcAPI.UpdateAllFields.
message UpdateAllFieldsRequest {
  // Forwarded as the wait_for_update argument of rdc_update_all_fields().
  // NOTE(review): presumably a 0/non-zero flag - confirm.
  uint32 wait_for_update = 1;
}
// Reply for RdcAPI.UpdateAllFields.
message UpdateAllFieldsResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Reply for RdcAPI.GetGroupAllIds.
message GetGroupAllIdsResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Ids of the GPU groups returned by the call.
  repeated uint32 group_ids = 2;
}
// Reply for RdcAPI.GetFieldGroupAllIds.
message GetFieldGroupAllIdsResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Ids of the field groups returned by the call.
  repeated uint32 field_group_ids = 2;
}
// Request for RdcAPI.StartJobStats.
message StartJobStatsRequest {
  // GPU group the job runs on.
  uint32 group_id = 1;

  // Job identifier. The C prototype quoted on the RPC uses char[64];
  // no length limit is enforced by this schema.
  string job_id = 2;

  // Update frequency. Units are not stated in this file - TODO document.
  uint64 update_freq = 3;
}
// Reply for RdcAPI.StartJobStats.
message StartJobStatsResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Request for RdcAPI.GetJobStats.
message GetJobStatsRequest {
  // Identifier of the job whose statistics are requested.
  string job_id = 1;
}
// Aggregate statistics for one metric, used by GpuUsageInfo for each of its
// summarized metrics.
message JobStatsSummary {
  // Largest value.
  uint64 max_value = 1;

  // Smallest value.
  uint64 min_value = 2;

  // Average value, as an integer.
  uint64 average = 3;

  // Standard deviation.
  double standard_deviation = 4;
}
// Usage figures for a job on one GPU. GetJobStatsResponse uses this type
// both for its `summary` field and for its per-GPU `gpus` list.
// Units for the numeric fields are not stated in this file.
message GpuUsageInfo {
  // Id of the GPU these figures belong to.
  uint32 gpu_id = 1;

  // Start and end timestamps.
  uint64 start_time = 2;
  uint64 end_time   = 3;

  // Energy consumed.
  uint64 energy_consumed = 4;

  // Summaries of power usage, GPU clock and GPU utilization.
  JobStatsSummary power_usage     = 5;
  JobStatsSummary gpu_clock       = 6;
  JobStatsSummary gpu_utilization = 7;

  // Maximum amount of GPU memory used.
  uint64 max_gpu_memory_used = 8;

  // Summary of memory utilization.
  JobStatsSummary memory_utilization = 9;

  // ECC error counters.
  // NOTE(review): presumably correctable / uncorrectable errors - confirm.
  uint64 ecc_correct   = 10;
  uint64 ecc_uncorrect = 11;

  // Summaries of PCIe transmit and receive traffic.
  JobStatsSummary pcie_tx = 12;
  JobStatsSummary pcie_rx = 13;

  // Summaries of memory clock and GPU temperature.
  JobStatsSummary memory_clock    = 14;
  JobStatsSummary gpu_temperature = 15;
}
// Reply for RdcAPI.GetJobStats.
message GetJobStatsResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Number of GPUs reported, sent alongside the `gpus` list.
  uint32 num_gpus = 2;

  // Summary usage figures for the job.
  GpuUsageInfo summary = 3;

  // Usage figures for each GPU.
  repeated GpuUsageInfo gpus = 4;
}
// Request for RdcAPI.StopJobStats.
message StopJobStatsRequest {
  // Identifier of the job to stop collecting statistics for.
  string job_id = 1;
}
// Reply for RdcAPI.StopJobStats.
message StopJobStatsResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Request for RdcAPI.RemoveJob.
message RemoveJobRequest {
  // Identifier of the job to remove.
  string job_id = 1;
}
// Reply for RdcAPI.RemoveJob.
message RemoveJobResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Reply for RdcAPI.RemoveAllJob.
message RemoveAllJobResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;
}
// Request for RdcAPI.DiagnosticRun; fields follow the rdc_diagnostic_run()
// argument list.
message DiagnosticRunRequest {
  // GPU group to run diagnostics on.
  uint32 group_id = 1;

  // Diagnostic level, sent as a raw integer (rdc_diag_level_t in the C
  // prototype quoted on the RPC).
  uint32 level = 2;

  // Configuration text handed to the diagnostic.
  string config = 3;

  // Size of `config`, sent separately as in the C prototype.
  uint32 config_size = 4;
}
// A message/code pair, used by DiagnosticTestResult and
// DiagnosticPerGpuResult.
message DiagnosticDetail {
  // Message text.
  string msg = 1;

  // Numeric code accompanying the message.
  uint32 code = 2;
}
// Diagnostic outcome for a single GPU.
message DiagnosticPerGpuResult {
  // Index of the GPU this result refers to.
  uint32 gpu_index = 1;

  // Message/code detail for that GPU.
  DiagnosticDetail gpu_result = 2;
}
// Result of a single diagnostic test case, with optional per-GPU breakdown.
message DiagnosticTestResult {
  // Identifies which test case a result belongs to. The value list matches
  // DiagnosticTestCaseRunRequest.TestCaseType; keep the two in sync.
  // The zero value is COMPUTE_PROCESS.
  enum DiagnosticTestCase {
    COMPUTE_PROCESS = 0;
    SDMA_QUEUE      = 1;
    COMPUTE_QUEUE   = 2;
    VRAM_CHECK      = 3;
    SYS_MEM_CHECK   = 4;
    NODE_TOPOLOGY   = 5;
    RVS_TEST        = 6;
    GPU_PARAMETERS  = 7;
  }

  // Result code for this test case.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Message/code detail for the test case as a whole.
  DiagnosticDetail details = 2;

  // Test case this result belongs to.
  DiagnosticTestCase test_case = 3;

  // Count of per-GPU results, sent alongside the `gpu_results` list.
  uint32 per_gpu_result_count = 4;

  // Per-GPU results.
  repeated DiagnosticPerGpuResult gpu_results = 5;

  // Additional free-form information.
  string info = 6;
}
// Collection of test results, embedded in DiagnosticRunResponse.
message DiagnosticResponse {
  // Count of results, sent alongside the `diag_info` list.
  uint32 results_count = 1;

  // One entry per test case result.
  repeated DiagnosticTestResult diag_info = 2;
}
// Reply for RdcAPI.DiagnosticRun.
message DiagnosticRunResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Test results produced by the run.
  DiagnosticResponse response = 2;
}
// Request for RdcAPI.DiagnosticTestCaseRun; fields follow the
// rdc_test_case_run() argument list.
message DiagnosticTestCaseRunRequest {
  // Selects the test case to run. The value list matches
  // DiagnosticTestResult.DiagnosticTestCase; keep the two in sync.
  // The zero value is COMPUTE_PROCESS, so an unset `test_case` selects it.
  enum TestCaseType {
    COMPUTE_PROCESS = 0;
    SDMA_QUEUE      = 1;
    COMPUTE_QUEUE   = 2;
    VRAM_CHECK      = 3;
    SYS_MEM_CHECK   = 4;
    NODE_TOPOLOGY   = 5;
    RVS_TEST        = 6;
    GPU_PARAMETERS  = 7;
  }

  // GPU group to run the test case on.
  uint32 group_id = 1;

  // Test case to run.
  TestCaseType test_case = 2;

  // Configuration text handed to the test case.
  string config = 3;

  // Size of `config`, sent separately as in the C prototype.
  uint32 config_size = 4;
}
// Reply for RdcAPI.DiagnosticTestCaseRun.
message DiagnosticTestCaseRunResponse {
  // Result code of the call.
  // NOTE(review): presumably an rdc_status_t value - confirm.
  uint32 status = 1;

  // Result of the test case that was run.
  DiagnosticTestResult result = 2;
}