Please follow the README file

Update README_rdc_rest_api.txt

Update RDC_REST_API.py

Error handling updates

Updates for error handling

Updates

Updates for rdc_field_watch/rdc_field_unwatch and delete query

Updates for rdc_field_watch/rdc_field_unwatch and delete query

SWDEV-479738 [RDC] - Rest API

Delete python_binding/RDC_REST_API.py

new rdc_rest_api.py file for SWDEV-479738 [RDC] - Rest API
Этот коммит содержится в:
AL Musaffar, Yazen
2024-12-03 15:29:35 -06:00
коммит произвёл Galantsev, Dmitrii
родитель e847f74f78
Коммит cf566ebd31
2 изменённых файлов: 234 добавлений и 0 удалений
+102
Просмотреть файл
@@ -0,0 +1,102 @@
# RDC REST API
## Overview
This REST API provides functionalities to:
- Discover available GPUs on a node.
- Configure and manage GPU monitoring queries.
- Retrieve GPU metrics based on configured queries.
The API is built using Flask and interacts with the RDC library to monitor GPU usage and performance metrics.
## Installation
### Prerequisites
- Python 3.x
- Flask
- RDC Library (`librdc_bootstrap.so` must be available and accessible)
### Install Dependencies
```sh
pip install flask
```
## Running the API
1. Ensure `librdc_bootstrap.so` is in the library path:
```sh
export LD_LIBRARY_PATH=/path/to/librdc_bootstrap.so:$LD_LIBRARY_PATH
```
2. Run the API:
```sh
python rdc_rest_api.py
```
The API will start and listen on `http://0.0.0.0:50052`.
## API Endpoints
### 1. Discover GPUs
**GET** `/rdc/discovery`
#### Response:
```json
{
"0": "GPU Name",
"1": "GPU Name"
}
```
### 2. Create Query Criteria
**POST** `/rdc/query_criteria`
#### Request Body:
```json
{
"gpu_index": [0,1],
"metrics": ["RDC_FI_GPU_CLOCK", "RDC_FI_GPU_TEMP"]
}
```
#### Response:
```json
{
"query_id": "G-1-F-2"
}
```
### 3. Get Query Criteria
**GET** `/rdc/query_criteria/<query_id>`
#### Response:
```json
{
"gpu_index": [0,1],
"metrics": ["RDC_FI_GPU_CLOCK", "RDC_FI_GPU_TEMP"],
"query_id": "G-1-F-2"
}
```
### 4. Delete Query Criteria
**DELETE** `/rdc/query_criteria/<query_id>`
#### Response:
```json
{
"message": "Deleted successfully"
}
```
### 5. Retrieve GPU Metrics
**GET** `/rdc/gpu_metrics/<query_id>`
#### Response:
```json
[
{
"gpu_index": 0,
"RDC_FI_GPU_CLOCK": 1450,
"RDC_FI_GPU_TEMP": 32
},
{
"gpu_index": 1,
"RDC_FI_GPU_CLOCK": 736,
"RDC_FI_GPU_TEMP": 35
}
]
```
## Notes
- Ensure `librdc_bootstrap.so` is properly linked.
- The API should be run on a system with RDC installed and GPUs accessible.
+132
Просмотреть файл
@@ -0,0 +1,132 @@
from flask import Flask, request, jsonify
from RdcReader import RdcReader
from RdcUtil import RdcUtil
from rdc_bootstrap import *
# Initialize Flask app
app = Flask(__name__)
# Initialize RDC Reader and Utilities for handling GPU queries
rdc_reader = RdcReader(ip_port=None)
rdc_util = RdcUtil()
# Dictionary to store query criteria with query_id
gpu_queries = {}
# Endpoint to discover available GPUs
@app.route('/rdc/discovery', methods=['GET'])
def discover_gpus():
"""Retrieve a list of available GPUs and their names."""
try:
gpu_indexes = rdc_util.get_all_gpu_indexes(rdc_reader.rdc_handle)
gpus = {}
for gpu in gpu_indexes:
device_attr = rdc_device_attributes_t()
rdc.rdc_device_get_attributes(rdc_reader.rdc_handle, gpu, device_attr)
gpus[gpu] = device_attr.device_name.decode('utf-8') # Decode GPU name from bytes
return jsonify(gpus)
except Exception as e:
return jsonify({"error": str(e)}), 500
# Endpoint to create a new query criteria
@app.route('/rdc/query_criteria', methods=['POST'])
def create_query_criteria():
"""Define a new query criteria specifying GPU indices and metrics to monitor."""
try:
data = request.json
if not data or "metrics" not in data:
return jsonify({"error": "Invalid request payload"}), 400
gpu_indexes = data.get("gpu_index", rdc_util.get_all_gpu_indexes(rdc_reader.rdc_handle))
metrics = data.get("metrics", [])
# Create rdc group and fieldgroup
gpu_group_id, _ = rdc_util.create_gpu_group(rdc_reader.rdc_handle, b"query_gpu_group", gpu_indexes)
field_group_id, _ = rdc_util.create_field_group(rdc_reader.rdc_handle, b"query_field_group", [rdc.get_field_id_from_name(m.encode('utf-8')).value for m in metrics])
# Call rdc_field_watch to start fetching metrics into cache
result = rdc.rdc_field_watch(rdc_reader.rdc_handle, gpu_group_id, field_group_id, 1000000, 3600.0, 1000)
if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
return jsonify({"error": "Failed to watch fields"}), 500
query_id = f"G-{gpu_group_id.value}-F-{field_group_id.value}"
gpu_queries[query_id] = {"gpu_index": gpu_indexes, "metrics": metrics, "query_id": query_id}
return jsonify({"query_id": query_id})
except Exception as e:
return jsonify({"error": str(e)}), 500
# Endpoint to get all query criteria
@app.route('/rdc/query_criteria', methods=['GET'])
def get_all_query_criteria():
"""Retrieve all stored query criteria for all GPUs."""
try:
query_id = request.args.get("query_id")
if query_id:
return jsonify(gpu_queries.get(query_id, {}))
return jsonify(list(gpu_queries.values()))
except Exception as e:
return jsonify({"error": str(e)}), 500
# Endpoint to retrieve a specific query criteria
@app.route('/rdc/query_criteria/<query_id>', methods=['GET'])
def get_query_criteria(query_id):
"""Retrieve query criteria based on a given query ID."""
try:
if query_id in gpu_queries:
return jsonify(gpu_queries[query_id])
return jsonify({"error": "Query ID not found"}), 404
except Exception as e:
return jsonify({"error": str(e)}), 500
# Endpoint to delete a specific query criteria
@app.route('/rdc/query_criteria/<query_id>', methods=['DELETE'])
def delete_query_criteria(query_id):
"""Delete a query criteria using its query ID."""
try:
if query_id in gpu_queries:
gpu_group_id = rdc_reader.field_group_id
field_group_id = rdc_reader.field_group_id
# Call rdc_field_unwatch to stop fetching metrics
result = rdc.rdc_field_unwatch(rdc_reader.rdc_handle, gpu_group_id, field_group_id)
if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
return jsonify({"error": "Failed to unwatch fields"}), 500
# Delete GPU and field groups
rdc.rdc_group_gpu_destroy(rdc_reader.rdc_handle, gpu_group_id)
rdc.rdc_group_field_destroy(rdc_reader.rdc_handle, field_group_id)
# Remove the query from storage
del gpu_queries[query_id]
return jsonify({"message": "Deleted successfully"})
return jsonify({"error": "Query ID not found"}), 404
except Exception as e:
return jsonify({"error": str(e)}), 500
# Endpoint to fetch GPU metrics for a specific query ID
@app.route('/rdc/gpu_metrics/<query_id>', methods=['GET'])
def get_gpu_metrics(query_id):
"""Retrieve GPU metrics based on the query ID."""
try:
if query_id not in gpu_queries:
return jsonify({"error": "Query ID not found"}), 404
query = gpu_queries[query_id]
gpu_metrics = [] # List to store GPU metric results
for gpu in query["gpu_index"]:
gpu_data = {"gpu_index": gpu} # Store GPU index in the response
for metric in query["metrics"]:
field_id = rdc.get_field_id_from_name(metric.encode('utf-8')).value
value = rdc_field_value()
result = rdc.rdc_field_get_latest_value(rdc_reader.rdc_handle, gpu, field_id, value)
if rdc_status_t(result) == rdc_status_t.RDC_ST_OK:
gpu_data[metric] = value.value.l_int # Store metric value
gpu_metrics.append(gpu_data) # Append GPU data to results
return jsonify(gpu_metrics)
except Exception as e:
return jsonify({"error": str(e)}), 500
# Main entry point to start the Flask server
if __name__ == '__main__':
# Runs the API server, making it accessible on all network interfaces
app.run(host='0.0.0.0', port=50052)