38c60ff90b
NOTE: RVS Build is disabled by default due to CI build issues.
Change-Id: I1593f0fe22075a9f86f54afa3ac151e109f1f7bd
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/rdc commit: eaa1862a80]
547 lines
14 KiB
Protocol Buffer
Executable File
547 lines
14 KiB
Protocol Buffer
Executable File
// Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

syntax = "proto3";

// The options below are disabled. Their values appear to be left over from
// the gRPC "helloworld" example and would need project-specific values
// before being re-enabled.
// option java_multiple_files = true;
// option java_package = "io.grpc.examples.helloworld";
// option java_outer_classname = "HelloWorldProto";
// option objc_class_prefix = "HLW";

// All messages and services in this file live in the `rdc` package.
package rdc;
|
|
|
|
/****************************************************************************/
|
|
/********************************** Rsmi Service ****************************/
|
|
/****************************************************************************/
|
|
// RPC surface for RSMI device queries. Every method is unary and has its own
// request and response message.
service Rsmi {
  // RSMI ID services
  rpc GetNumDevices(GetNumDevicesRequest) returns (GetNumDevicesResponse) {}

  // RSMI Physical Queries
  rpc GetTemperature(GetTemperatureRequest) returns (GetTemperatureResponse) {}
  rpc GetFanRpms(GetFanRpmsRequest) returns (GetFanRpmsResponse) {}
  rpc GetFanSpeed(GetFanSpeedRequest) returns (GetFanSpeedResponse) {}
  rpc GetFanSpeedMax(GetFanSpeedMaxRequest) returns (GetFanSpeedMaxResponse) {}
}
|
|
|
|
/* rsmi_num_monitor_devices() */
|
|
// Request for Rsmi.GetNumDevices. Intentionally has no fields.
message GetNumDevicesRequest {
}
|
|
// Response for Rsmi.GetNumDevices.
message GetNumDevicesResponse {
  // Result value of the query.
  // NOTE(review): presumably the number of monitored devices - confirm.
  uint64 val = 1;

  // NOTE(review): presumably the status code returned by the underlying
  // RSMI call - confirm against the server implementation.
  uint64 ret_val = 2;
}
|
|
|
|
/* GetTemperature */
|
|
/* rsmi_dev_temp_metric_get() */
|
|
// Request for Rsmi.GetTemperature.
message GetTemperatureRequest {
  // Which temperature metric to read. Zero (RSMI_TEMP_CURRENT) is the proto3
  // default, so it is what a request with `metric` unset selects.
  // NOTE(review): names presumably mirror the RSMI temperature metric enum;
  // confirm the numeric values stay in sync with it.
  enum TemperatureMetric {
    RSMI_TEMP_CURRENT = 0;
    RSMI_TEMP_MAX = 1;
    RSMI_TEMP_MIN = 2;
    RSMI_TEMP_MAX_HYST = 3;
    RSMI_TEMP_MIN_HYST = 4;
    RSMI_TEMP_CRITICAL = 5;
    RSMI_TEMP_CRITICAL_HYST = 6;
    RSMI_TEMP_EMERGENCY = 7;
    RSMI_TEMP_EMERGENCY_HYST = 8;
    RSMI_TEMP_CRIT_MIN = 9;
    RSMI_TEMP_CRIT_MIN_HYST = 10;
    RSMI_TEMP_OFFSET = 11;
    RSMI_TEMP_LOWEST = 12;
    RSMI_TEMP_HIGHEST = 13;
  }

  // Index of the device to query.
  uint32 dv_ind = 1;

  // Sensor selector. NOTE(review): valid values are not defined in this
  // file - confirm against the RSMI sensor type definitions.
  uint32 sensor_type = 2;

  // Metric to read from the selected sensor.
  TemperatureMetric metric = 3;
}
|
|
// Response for Rsmi.GetTemperature.
message GetTemperatureResponse {
  // Temperature reading. Signed. NOTE(review): units are not stated in this
  // file - confirm against the RSMI documentation.
  int64 temperature = 1;

  // NOTE(review): presumably the status code returned by the underlying
  // RSMI call - confirm against the server implementation.
  uint64 ret_val = 2;
}
|
|
|
|
/* GetFanRpms */
|
|
/* rsmi_dev_fan_rpms_get() */
|
|
// Request for Rsmi.GetFanRpms.
message GetFanRpmsRequest {
  // Index of the device to query.
  uint32 dv_ind = 1;

  // Index of the fan sensor on that device.
  uint32 sensor_ind = 2;
}
|
|
// Response for Rsmi.GetFanRpms.
message GetFanRpmsResponse {
  // Fan speed reading in RPMs (per the field name). Signed.
  int64 rpms = 1;

  // NOTE(review): presumably the status code returned by the underlying
  // RSMI call - confirm against the server implementation.
  uint64 ret_val = 2;
}
|
|
/* GetFanSpeed */
|
|
/* rsmi_dev_fan_speed_get() */
|
|
// Request for Rsmi.GetFanSpeed.
message GetFanSpeedRequest {
  // Index of the device to query.
  uint32 dv_ind = 1;

  // Index of the fan sensor on that device.
  uint32 sensor_ind = 2;
}
|
|
// Response for Rsmi.GetFanSpeed.
message GetFanSpeedResponse {
  // Fan speed reading. Signed. NOTE(review): scale is not stated in this
  // file; presumably relative to GetFanSpeedMaxResponse.max_speed - confirm.
  int64 speed = 1;

  // NOTE(review): presumably the status code returned by the underlying
  // RSMI call - confirm against the server implementation.
  uint64 ret_val = 2;
}
|
|
|
|
/* GetFanSpeedMax */
|
|
/* rsmi_dev_fan_speed_max_get() */
|
|
// Request for Rsmi.GetFanSpeedMax.
message GetFanSpeedMaxRequest {
  // Index of the device to query.
  uint32 dv_ind = 1;

  // Index of the fan sensor on that device.
  uint32 sensor_ind = 2;
}
|
|
// Response for Rsmi.GetFanSpeedMax.
message GetFanSpeedMaxResponse {
  // Maximum fan speed value. Unsigned, unlike GetFanSpeedResponse.speed.
  uint64 max_speed = 1;

  // NOTE(review): presumably the status code returned by the underlying
  // RSMI call - confirm against the server implementation.
  uint64 ret_val = 2;
}
|
|
|
|
/****************************************************************************/
|
|
/********************************** RdcAdmin Service ************************/
|
|
/****************************************************************************/
|
|
// Administrative RPCs for the RDC service.
service RdcAdmin {
  // RDC admin services
  // Sends a magic number; the response carries an echo field for it.
  rpc VerifyConnection(VerifyConnectionRequest)
      returns (VerifyConnectionResponse) {}
}
|
|
|
|
/* VerifyConnection */
|
|
// Request for RdcAdmin.VerifyConnection.
message VerifyConnectionRequest {
  // Caller-chosen value. NOTE(review): presumably the server echoes it back
  // in VerifyConnectionResponse.echo_magic_num - confirm.
  uint64 magic_num = 1;
}
|
|
// Response for RdcAdmin.VerifyConnection.
message VerifyConnectionResponse {
  // NOTE(review): presumably a copy of VerifyConnectionRequest.magic_num, so
  // the caller can check the round trip - confirm.
  uint64 echo_magic_num = 1;
}
|
|
|
|
/****************************************************************************/
|
|
/********************************** RdcAPI Service ************************/
|
|
/****************************************************************************/
|
|
|
|
// RPC surface for the RDC API. Every method is unary. The comment above each
// rpc quotes the C function signature it corresponds to.
service RdcAPI {
  // Discovery API
  // rdc_status_t rdc_get_all_devices(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count)
  rpc GetAllDevices(Empty) returns (GetAllDevicesResponse) {}

  // rdc_status_t rdc_get_device_attributes(uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr)
  rpc GetDeviceAttributes(GetDeviceAttributesRequest) returns (GetDeviceAttributesResponse) {}

  // Group API
  // rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
  //     const char* group_name, rdc_gpu_group_t* p_rdc_group_id)
  rpc CreateGpuGroup(CreateGpuGroupRequest) returns (CreateGpuGroupResponse) {}

  // rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
  //     uint32_t gpu_index)
  rpc AddToGpuGroup(AddToGpuGroupRequest) returns (AddToGpuGroupResponse) {}

  // rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
  //     uint32_t* field_ids, const char* field_group_name,
  //     rdc_field_grp_t* rdc_field_group_id)
  rpc CreateFieldGroup(CreateFieldGroupRequest) returns (CreateFieldGroupResponse) {}

  // rdc_status_t rdc_group_field_get_info(
  //     rdc_field_grp_t rdc_field_group_id,
  //     rdc_field_group_info_t* field_group_info)
  rpc GetFieldGroupInfo(GetFieldGroupInfoRequest) returns (GetFieldGroupInfoResponse) {}

  // rdc_status_t rdc_group_gpu_get_info(
  //     rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info)
  rpc GetGpuGroupInfo(GetGpuGroupInfoRequest) returns (GetGpuGroupInfoResponse) {}

  // rdc_status_t rdc_group_gpu_destroy(
  //     rdc_gpu_group_t p_rdc_group_id)
  rpc DestroyGpuGroup(DestroyGpuGroupRequest) returns (DestroyGpuGroupResponse) {}

  // rdc_status_t rdc_group_field_destroy(
  //     rdc_field_grp_t rdc_field_group_id)
  rpc DestroyFieldGroup(DestroyFieldGroupRequest) returns (DestroyFieldGroupResponse) {}

  // Field API
  // rdc_status_t rdc_watch_fields(rdc_gpu_group_t group_id,
  //     rdc_field_grp_t field_group_id, uint64_t update_freq,
  //     double max_keep_age, uint32_t max_keep_samples)
  rpc WatchFields(WatchFieldsRequest) returns (WatchFieldsResponse) {}

  // rdc_status_t rdc_get_latest_value_for_field(uint32_t gpu_index,
  //     uint32_t field, rdc_field_value* value)
  rpc GetLatestFieldValue(GetLatestFieldValueRequest) returns (GetLatestFieldValueResponse) {}

  // rdc_status_t rdc_get_field_value_since(uint32_t gpu_index,
  //     uint32_t field, uint64_t since_time_stamp,
  //     uint64_t *next_since_time_stamp, rdc_field_value* value)
  rpc GetFieldSince(GetFieldSinceRequest) returns (GetFieldSinceResponse) {}

  // rdc_status_t rdc_unwatch_fields(rdc_gpu_group_t group_id,
  //     rdc_field_grp_t field_group_id)
  rpc UnWatchFields(UnWatchFieldsRequest) returns (UnWatchFieldsResponse) {}

  // rdc_status_t rdc_update_all_fields(uint32_t wait_for_update)
  rpc UpdateAllFields(UpdateAllFieldsRequest) returns (UpdateAllFieldsResponse) {}

  // rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count)
  rpc GetGroupAllIds(Empty) returns (GetGroupAllIdsResponse) {}

  // rdc_status_t rdc_group_field_all_ids(rdc_field_grp_t field_group_id_list[], uint32_t* count)
  rpc GetFieldGroupAllIds(Empty) returns (GetFieldGroupAllIdsResponse) {}

  // JOB API
  // rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId,
  //     char job_id[64], uint64_t update_freq)
  rpc StartJobStats(StartJobStatsRequest) returns (StartJobStatsResponse) {}

  // rdc_status_t rdc_job_get_stats(char jobId[64],
  //     rdc_job_info_t* p_job_info)
  rpc GetJobStats(GetJobStatsRequest) returns (GetJobStatsResponse) {}

  // rdc_status_t rdc_job_stop_stats(char job_id[64])
  rpc StopJobStats(StopJobStatsRequest) returns (StopJobStatsResponse) {}

  // rdc_status_t rdc_job_remove(char job_id[64])
  rpc RemoveJob(RemoveJobRequest) returns (RemoveJobResponse) {}

  // rdc_status_t rdc_job_remove_all()
  rpc RemoveAllJob(Empty) returns (RemoveAllJobResponse) {}

  // rdc_status_t rdc_diagnostic_run(
  //     rdc_gpu_group_t group_id,
  //     rdc_diag_level_t level,
  //     const char* config,
  //     size_t config_size,
  //     rdc_diag_response_t* response);
  rpc DiagnosticRun(DiagnosticRunRequest) returns (DiagnosticRunResponse) {}

  // rdc_status_t rdc_test_case_run(
  //     rdc_gpu_group_t group_id,
  //     rdc_diag_test_cases_t test_case,
  //     const char* config,
  //     size_t config_size,
  //     rdc_diag_test_result_t* result);
  rpc DiagnosticTestCaseRun(DiagnosticTestCaseRunRequest) returns (DiagnosticTestCaseRunResponse) {}
}
|
|
|
|
// Field-less request message shared by the RdcAPI RPCs that take no input
// (GetAllDevices, GetGroupAllIds, GetFieldGroupAllIds, RemoveAllJob).
message Empty {
}
|
|
|
|
// Response for RdcAPI.GetAllDevices.
message GetAllDevicesResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value, as returned by rdc_get_all_devices - confirm.
  uint32 status = 1;

  // GPU indexes found. Corresponds to gpu_index_list in the C signature;
  // the C `count` out-parameter is implied by the list length.
  repeated uint32 gpus = 2;
}
|
|
|
|
// Request for RdcAPI.GetDeviceAttributes.
message GetDeviceAttributesRequest {
  // Index of the GPU whose attributes are requested.
  uint32 gpu_index = 1;
}
|
|
|
|
// Attributes of a single device, embedded in GetDeviceAttributesResponse.
// NOTE(review): presumably the wire form of rdc_device_attributes_t -
// confirm.
message DeviceAttributes {
  // Name of the device.
  string device_name = 1;
}
|
|
|
|
// Response for RdcAPI.GetDeviceAttributes.
message GetDeviceAttributesResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Attributes of the requested GPU.
  DeviceAttributes attributes = 2;
}
|
|
|
|
// Request for RdcAPI.CreateGpuGroup.
message CreateGpuGroupRequest {
  // Kind of group to create. RDC_GROUP_DEFAULT is zero and therefore the
  // proto3 default when `type` is unset.
  // NOTE(review): presumably mirrors rdc_group_type_t - confirm.
  enum GpuGroupType {
    RDC_GROUP_DEFAULT = 0;
    RDC_GROUP_EMPTY = 1;
  }

  // Group type; corresponds to `type` in rdc_group_gpu_create.
  GpuGroupType type = 1;

  // Name for the new group.
  string group_name = 2;
}
|
|
|
|
// Response for RdcAPI.CreateGpuGroup.
message CreateGpuGroupResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Id of the created group; corresponds to the p_rdc_group_id
  // out-parameter of rdc_group_gpu_create.
  uint32 group_id = 2;
}
|
|
|
|
// Request for RdcAPI.AddToGpuGroup.
message AddToGpuGroupRequest {
  // Id of the GPU group to extend.
  uint32 group_id = 1;

  // Index of the GPU to add to the group.
  uint32 gpu_index = 2;
}
|
|
|
|
// Response for RdcAPI.AddToGpuGroup.
message AddToGpuGroupResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Request for RdcAPI.CreateFieldGroup.
message CreateFieldGroupRequest {
  // Ids of the fields to put in the group. The C `num_field_ids` parameter
  // is implied by the list length.
  repeated uint32 field_ids = 1;

  // Name for the new field group.
  string field_group_name = 2;
}
|
|
|
|
// Response for RdcAPI.CreateFieldGroup.
message CreateFieldGroupResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Id of the created field group; corresponds to the rdc_field_group_id
  // out-parameter of rdc_group_field_create.
  uint32 field_group_id = 2;
}
|
|
|
|
// Request for RdcAPI.GetFieldGroupInfo.
message GetFieldGroupInfoRequest {
  // Id of the field group to describe.
  uint32 field_group_id = 1;
}
|
|
|
|
// Response for RdcAPI.GetFieldGroupInfo.
message GetFieldGroupInfoResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Name of the field group.
  // NOTE: "filed" is a misspelling of "field". The name is kept as-is
  // because renaming would change generated accessors and the JSON key;
  // the wire format depends only on the field number.
  string filed_group_name = 2;

  // Ids of the fields contained in the group.
  repeated uint32 field_ids = 3;
}
|
|
|
|
// Request for RdcAPI.GetGpuGroupInfo.
message GetGpuGroupInfoRequest {
  // Id of the GPU group to describe.
  uint32 group_id = 1;
}
|
|
|
|
// Response for RdcAPI.GetGpuGroupInfo.
message GetGpuGroupInfoResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Name of the GPU group.
  string group_name = 2;

  // Members of the group. NOTE(review): presumably GPU indexes - confirm.
  repeated uint32 entity_ids = 3;
}
|
|
|
|
// Request for RdcAPI.DestroyGpuGroup.
message DestroyGpuGroupRequest {
  // Id of the GPU group to destroy.
  uint32 group_id = 1;
}
|
|
|
|
// Response for RdcAPI.DestroyGpuGroup.
message DestroyGpuGroupResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Request for RdcAPI.DestroyFieldGroup.
message DestroyFieldGroupRequest {
  // Id of the field group to destroy.
  uint32 field_group_id = 1;
}
|
|
|
|
// Response for RdcAPI.DestroyFieldGroup.
message DestroyFieldGroupResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Request for RdcAPI.WatchFields. Fields map one-to-one onto the parameters
// of rdc_watch_fields.
message WatchFieldsRequest {
  // Id of the GPU group to watch.
  uint32 group_id = 1;

  // Id of the field group to watch on those GPUs.
  uint32 field_group_id = 2;

  // Update frequency. NOTE(review): units are not stated in this file -
  // confirm against the rdc_watch_fields documentation.
  uint64 update_freq = 3;

  // Maximum age of samples to keep. NOTE(review): units are not stated in
  // this file - confirm.
  double max_keep_age = 4;

  // Maximum number of samples to keep.
  uint32 max_keep_samples = 5;
}
|
|
|
|
// Response for RdcAPI.WatchFields.
message WatchFieldsResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Request for RdcAPI.GetLatestFieldValue.
message GetLatestFieldValueRequest {
  // Index of the GPU to read from.
  uint32 gpu_index = 1;

  // Id of the field to read.
  uint32 field_id = 2;
}
|
|
|
|
// Response for RdcAPI.GetLatestFieldValue: one field sample plus the call
// status.
message GetLatestFieldValueResponse {
  // Type tag for `value`. BLOB is declared, but the `value` oneof below has
  // no corresponding member.
  enum FieldType {
    INTEGER = 0;
    DOUBLE = 1;
    STRING = 2;
    BLOB = 3;
  }

  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Id of the field this sample belongs to.
  uint32 field_id = 2;

  // NOTE(review): presumably the status stored in the field value itself,
  // as opposed to `status` for the call - confirm.
  uint32 rdc_status = 3;

  // Timestamp of the sample. NOTE(review): units and epoch are not stated
  // in this file - confirm.
  uint64 ts = 4;

  // Declared type of the sample.
  FieldType type = 5;

  // The sample itself; at most one member is set.
  oneof value {
    uint64 l_int = 6;
    double dbl = 7;
    string str = 8;
  }
}
|
|
|
|
// Request for RdcAPI.GetFieldSince.
message GetFieldSinceRequest {
  // Index of the GPU to read from.
  uint32 gpu_index = 1;

  // Id of the field to read.
  uint32 field_id = 2;

  // Timestamp to read from; corresponds to since_time_stamp in
  // rdc_get_field_value_since. NOTE(review): units are not stated in this
  // file - confirm.
  uint64 since_time_stamp = 3;
}
|
|
|
|
// Response for RdcAPI.GetFieldSince: one field sample, the timestamp to use
// for the next query, and the call status.
message GetFieldSinceResponse {
  // Type tag for `value`. BLOB is declared, but the `value` oneof below has
  // no corresponding member.
  enum FieldType {
    INTEGER = 0;
    DOUBLE = 1;
    STRING = 2;
    BLOB = 3;
  }

  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Corresponds to the next_since_time_stamp out-parameter of
  // rdc_get_field_value_since.
  uint64 next_since_time_stamp = 2;

  // Id of the field this sample belongs to.
  uint32 field_id = 3;

  // NOTE(review): presumably the status stored in the field value itself,
  // as opposed to `status` for the call - confirm.
  uint32 rdc_status = 4;

  // Timestamp of the sample. NOTE(review): units and epoch are not stated
  // in this file - confirm.
  uint64 ts = 5;

  // Declared type of the sample.
  FieldType type = 6;

  // The sample itself; at most one member is set.
  oneof value {
    uint64 l_int = 7;
    double dbl = 8;
    string str = 9;
  }
}
|
|
|
|
// Request for RdcAPI.UnWatchFields.
message UnWatchFieldsRequest {
  // Id of the GPU group to stop watching.
  uint32 group_id = 1;

  // Id of the field group to stop watching on those GPUs.
  uint32 field_group_id = 2;
}
|
|
|
|
// Response for RdcAPI.UnWatchFields.
message UnWatchFieldsResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Request for RdcAPI.UpdateAllFields.
message UpdateAllFieldsRequest {
  // Corresponds to the wait_for_update parameter of rdc_update_all_fields.
  // NOTE(review): presumably treated as a boolean flag - confirm.
  uint32 wait_for_update = 1;
}
|
|
|
|
// Response for RdcAPI.UpdateAllFields.
message UpdateAllFieldsResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Response for RdcAPI.GetGroupAllIds.
message GetGroupAllIdsResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Ids of the existing GPU groups; the C `count` out-parameter is implied
  // by the list length.
  repeated uint32 group_ids = 2;
}
|
|
|
|
|
|
// Response for RdcAPI.GetFieldGroupAllIds.
message GetFieldGroupAllIdsResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Ids of the existing field groups; the C `count` out-parameter is
  // implied by the list length.
  repeated uint32 field_group_ids = 2;
}
|
|
|
|
// Request for RdcAPI.StartJobStats.
message StartJobStatsRequest {
  // Id of the GPU group the job runs on.
  uint32 group_id = 1;

  // Job identifier. The C API declares it as char job_id[64].
  // NOTE(review): presumably longer strings are rejected or truncated by
  // the server - confirm.
  string job_id = 2;

  // Update frequency. NOTE(review): units are not stated in this file -
  // confirm.
  uint64 update_freq = 3;
}
|
|
|
|
// Response for RdcAPI.StartJobStats.
message StartJobStatsResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Request for RdcAPI.GetJobStats.
message GetJobStatsRequest {
  // Identifier of the job whose statistics are requested.
  string job_id = 1;
}
|
|
|
|
// Summary statistics for one metric, used by the GpuUsageInfo fields.
// Units depend on the metric and are not stated in this file.
message JobStatsSummary {
  // Largest value.
  uint64 max_value = 1;

  // Smallest value.
  uint64 min_value = 2;

  // Average value. Integer-typed, unlike standard_deviation.
  uint64 average = 3;

  // Standard deviation of the values.
  double standard_deviation = 4;
}
|
|
|
|
// Job usage statistics for one GPU. GetJobStatsResponse uses this type both
// for each per-GPU entry and for its `summary` field.
// NOTE(review): units for times, energy, clocks and memory are not stated in
// this file - confirm against the rdc_job_info_t documentation.
message GpuUsageInfo {
  // Id of the GPU these statistics describe.
  uint32 gpu_id = 1;

  // Start and end time of the measured interval.
  uint64 start_time = 2;
  uint64 end_time = 3;

  // Energy consumed over the interval.
  uint64 energy_consumed = 4;

  // Power, clock and utilization summaries.
  JobStatsSummary power_usage = 5;
  JobStatsSummary gpu_clock = 6;
  JobStatsSummary gpu_utilization = 7;

  // Peak GPU memory used, followed by the memory utilization summary.
  uint64 max_gpu_memory_used = 8;
  JobStatsSummary memory_utilization = 9;

  // ECC error counters. NOTE(review): presumably correctable and
  // uncorrectable error counts respectively - confirm.
  uint64 ecc_correct = 10;
  uint64 ecc_uncorrect = 11;

  // PCIe transmit/receive summaries.
  JobStatsSummary pcie_tx = 12;
  JobStatsSummary pcie_rx = 13;

  // Memory clock and GPU temperature summaries.
  JobStatsSummary memory_clock = 14;
  JobStatsSummary gpu_temperature = 15;
}
|
|
// Response for RdcAPI.GetJobStats.
message GetJobStatsResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Number of GPUs. NOTE(review): presumably equal to the length of
  // `gpus` - confirm whether readers rely on this or on the list length.
  uint32 num_gpus = 2;

  // NOTE(review): presumably statistics aggregated over all GPUs of the
  // job - confirm.
  GpuUsageInfo summary = 3;

  // Per-GPU statistics.
  repeated GpuUsageInfo gpus = 4;
}
|
|
|
|
// Request for RdcAPI.StopJobStats.
message StopJobStatsRequest {
  // Identifier of the job to stop collecting statistics for.
  string job_id = 1;
}
|
|
|
|
// Response for RdcAPI.StopJobStats.
message StopJobStatsResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Request for RdcAPI.RemoveJob.
message RemoveJobRequest {
  // Identifier of the job to remove.
  string job_id = 1;
}
|
|
|
|
// Response for RdcAPI.RemoveJob.
message RemoveJobResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Response for RdcAPI.RemoveAllJob.
message RemoveAllJobResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;
}
|
|
|
|
// Request for RdcAPI.DiagnosticRun. Fields map onto the input parameters of
// rdc_diagnostic_run.
message DiagnosticRunRequest {
  // Id of the GPU group to run diagnostics on.
  uint32 group_id = 1;

  // Diagnostic level; corresponds to rdc_diag_level_t in the C signature.
  // Valid values are not defined in this file.
  uint32 level = 2;

  // Diagnostic configuration text.
  string config = 3;

  // Corresponds to config_size in the C signature.
  // NOTE(review): presumably the byte length of `config` - confirm which of
  // the two the server trusts.
  uint32 config_size = 4;
}
|
|
|
|
// A message/code pair describing a diagnostic outcome. Used by
// DiagnosticTestResult and DiagnosticPerGpuResult.
message DiagnosticDetail {
  // Human-readable message.
  string msg = 1;

  // Numeric code. Meaning is not defined in this file.
  uint32 code = 2;
}
|
|
|
|
// Diagnostic outcome for a single GPU.
message DiagnosticPerGpuResult {
  // Index of the GPU this result applies to.
  uint32 gpu_index = 1;

  // Outcome for that GPU.
  DiagnosticDetail gpu_result = 2;
}
|
|
|
|
// Result of one diagnostic test case.
message DiagnosticTestResult {
  // Test case this result belongs to. COMPUTE_PROCESS is zero and therefore
  // the proto3 default when `test_case` is unset. The values match
  // DiagnosticTestCaseRunRequest.TestCaseType name for name.
  enum DiagnosticTestCase {
    COMPUTE_PROCESS = 0;
    SDMA_QUEUE = 1;
    COMPUTE_QUEUE = 2;
    VRAM_CHECK = 3;
    SYS_MEM_CHECK = 4;
    NODE_TOPOLOGY = 5;
    RVS_TEST = 6;
    GPU_PARAMETERS = 7;
  }

  // Status of the test. NOTE(review): the set of valid values is not
  // defined in this file - confirm.
  uint32 status = 1;

  // Overall message/code for the test.
  DiagnosticDetail details = 2;

  // Which test case was run.
  DiagnosticTestCase test_case = 3;

  // NOTE(review): presumably equal to the length of `gpu_results` -
  // confirm whether readers rely on this or on the list length.
  uint32 per_gpu_result_count = 4;

  // Per-GPU outcomes.
  repeated DiagnosticPerGpuResult gpu_results = 5;

  // Additional free-form information.
  string info = 6;
}
|
|
|
|
// Collection of diagnostic test results, embedded in DiagnosticRunResponse.
message DiagnosticResponse {
  // NOTE(review): presumably equal to the length of `diag_info` - confirm
  // whether readers rely on this or on the list length.
  uint32 results_count = 1;

  // One entry per test case.
  repeated DiagnosticTestResult diag_info = 2;
}
|
|
|
|
// Response for RdcAPI.DiagnosticRun.
message DiagnosticRunResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Diagnostic results; corresponds to the `response` out-parameter of
  // rdc_diagnostic_run.
  DiagnosticResponse response = 2;
}
|
|
|
|
// Request for RdcAPI.DiagnosticTestCaseRun. Fields map onto the input
// parameters of rdc_test_case_run.
message DiagnosticTestCaseRunRequest {
  // Test case to run. COMPUTE_PROCESS is zero and therefore the proto3
  // default when `test_case` is unset. The values match
  // DiagnosticTestResult.DiagnosticTestCase name for name.
  enum TestCaseType {
    COMPUTE_PROCESS = 0;
    SDMA_QUEUE = 1;
    COMPUTE_QUEUE = 2;
    VRAM_CHECK = 3;
    SYS_MEM_CHECK = 4;
    NODE_TOPOLOGY = 5;
    RVS_TEST = 6;
    GPU_PARAMETERS = 7;
  }

  // Id of the GPU group to run the test case on.
  uint32 group_id = 1;

  // Which test case to run.
  TestCaseType test_case = 2;

  // Test configuration text.
  string config = 3;

  // Corresponds to config_size in the C signature.
  // NOTE(review): presumably the byte length of `config` - confirm which of
  // the two the server trusts.
  uint32 config_size = 4;
}
|
|
|
|
// Response for RdcAPI.DiagnosticTestCaseRun.
message DiagnosticTestCaseRunResponse {
  // Result code of the call. NOTE(review): presumably an rdc_status_t
  // value - confirm.
  uint32 status = 1;

  // Result of the test case; corresponds to the `result` out-parameter of
  // rdc_test_case_run.
  DiagnosticTestResult result = 2;
}
|